]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Merge pull request #8611 from remitamine/ffmpegfd
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
6a3828fd 1from __future__ import unicode_literals
f1a9d64e 2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
1bac3455 13import math
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
799207e8 17 compat_cookies,
e9c0cdd3 18 compat_etree_fromstring,
e64b7569 19 compat_getpass,
d6983cb4 20 compat_http_client,
e9c0cdd3
YCH
21 compat_os_name,
22 compat_str,
d6983cb4 23 compat_urllib_error,
a107193e 24 compat_urllib_parse,
f0b5d6af 25 compat_urlparse,
8c25f81b
PH
26)
27from ..utils import (
c342041f 28 NO_DEFAULT,
05900629 29 age_restricted,
08f2a92c 30 bug_reports_message,
d6983cb4
PH
31 clean_html,
32 compiled_regex_type,
70f0f5a8 33 determine_ext,
9b9c5355 34 error_to_compat_str,
d6983cb4 35 ExtractorError,
97f4aecf 36 fix_xml_ampersands,
b14f3a4c 37 float_or_none,
31bb8d3f 38 int_or_none,
4ca2a3cf 39 parse_iso8601,
55b3e45b 40 RegexNotFoundError,
d41e6efc 41 sanitize_filename,
5c2266df 42 sanitized_Request,
f38de77f 43 unescapeHTML,
647eab45 44 unified_strdate,
a107193e 45 url_basename,
8d6765cf
S
46 xpath_text,
47 xpath_with_ns,
d497a201 48 determine_protocol,
1bac3455 49 parse_duration,
cafcf657 50 mimetype2ext,
cdfee168 51 update_url_query,
d6983cb4 52)
c342041f 53
d6983cb4
PH
54
55class InfoExtractor(object):
56 """Information Extractor class.
57
58 Information extractors are the classes that, given a URL, extract
59 information about the video (or videos) the URL refers to. This
60 information includes the real video URL, the video title, author and
61 others. The information is stored in a dictionary which is then
5d380852 62 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
63 information possibly downloading the video to the file system, among
64 other possible outcomes.
65
cf0649f8 66 The type field determines the type of the result.
fed5d032
PH
67 By far the most common value (and the default if _type is missing) is
68 "video", which indicates a single video.
69
70 For a video, the dictionaries must include the following fields:
d6983cb4
PH
71
72 id: Video identifier.
d6983cb4 73 title: Video title, unescaped.
d67b0b15 74
f49d89ee 75 Additionally, it must contain either a formats entry or a url one:
d67b0b15 76
f49d89ee
PH
77 formats: A list of dictionaries for each format available, ordered
78 from worst to best quality.
79
80 Potential fields:
d67b0b15 81 * url Mandatory. The URL of the video file
10952eb2 82 * ext Will be calculated from URL if missing
d67b0b15
PH
83 * format A human-readable description of the format
84 ("mp4 container with h264/opus").
85 Calculated from the format_id, width, height.
86 and format_note fields if missing.
87 * format_id A short description of the format
5d4f3985
PH
88 ("mp4_h264_opus" or "19").
89 Technically optional, but strongly recommended.
d67b0b15
PH
90 * format_note Additional info about the format
91 ("3D" or "DASH video")
92 * width Width of the video, if known
93 * height Height of the video, if known
f49d89ee 94 * resolution Textual description of width and height
7217e148 95 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
96 * abr Average audio bitrate in KBit/s
97 * acodec Name of the audio codec in use
dd27fd17 98 * asr Audio sampling rate in Hertz
d67b0b15 99 * vbr Average video bitrate in KBit/s
fbb21cf5 100 * fps Frame rate
d67b0b15 101 * vcodec Name of the video codec in use
1394ce65 102 * container Name of the container format
d67b0b15 103 * filesize The number of bytes, if known in advance
9732d77e 104 * filesize_approx An estimate for the number of bytes
d67b0b15 105 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
106 * protocol The protocol that will be used for the actual
107 download, lower-case.
b04b8852 108 "http", "https", "rtsp", "rtmp", "rtmpe",
af7d5a63 109 "m3u8", "m3u8_native" or "http_dash_segments".
f49d89ee 110 * preference Order number of this format. If this field is
08d13955 111 present and not None, the formats get sorted
38d63d84 112 by this field, regardless of all other values.
f49d89ee
PH
113 -1 for default (order by other properties),
114 -2 or smaller for less than default.
e65566a9
PH
115 < -1000 to hide the format (if there is
116 another one which is strictly better)
32f90364
PH
117 * language Language code, e.g. "de" or "en-US".
118 * language_preference Is this in the language mentioned in
119 the URL?
aff2f4f4
PH
120 10 if it's what the URL is about,
121 -1 for default (don't know),
122 -10 otherwise, other values reserved for now.
5d73273f
PH
123 * quality Order number of the video quality of this
124 format, irrespective of the file format.
125 -1 for default (order by other properties),
126 -2 or smaller for less than default.
c64ed2a3
PH
127 * source_preference Order number for this video source
128 (quality takes higher priority)
129 -1 for default (order by other properties),
130 -2 or smaller for less than default.
d769be6c
PH
131 * http_headers A dictionary of additional HTTP headers
132 to add to the request.
6271f1ca 133 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
134 video's pixels are not square.
135 width : height ratio as float.
136 * no_resume The server does not support resuming the
137 (HTTP or RTMP) download. Boolean.
138
c0ba0f48 139 url: Final video URL.
d6983cb4 140 ext: Video filename extension.
d67b0b15
PH
141 format: The video format, defaults to ext (used for --get-format)
142 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 143
d6983cb4
PH
144 The following fields are optional:
145
f5e43bc6 146 alt_title: A secondary title of the video.
0afef30b
PH
147 display_id An alternative identifier for the video, not necessarily
148 unique, but available before title. Typically, id is
149 something like "4234987", title "Dancing naked mole rats",
150 and display_id "dancing-naked-mole-rats"
d5519808 151 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 152 * "id" (optional, string) - Thumbnail format ID
d5519808 153 * "url"
cfb56d1a 154 * "preference" (optional, int) - quality of the image
d5519808
PH
155 * "width" (optional, int)
156 * "height" (optional, int)
157 * "resolution" (optional, string "{width}x{height}",
158 deprecated)
d6983cb4 159 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 160 description: Full video description.
d6983cb4 161 uploader: Full name of the video uploader.
2bc0c46f 162 license: License name the video is licensed under.
9bb8e0a3 163 creator: The main artist who created the video.
8aab976b 164 release_date: The date (YYYYMMDD) when the video was released.
955c4514 165 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 166 upload_date: Video upload date (YYYYMMDD).
955c4514 167 If not explicitly set, calculated from timestamp.
d6983cb4 168 uploader_id: Nickname or id of the video uploader.
7bcd2830 169 uploader_url: Full URL to a personal webpage of the video uploader.
da9ec3b9 170 location: Physical location where the video was filmed.
a504ced0
JMF
171 subtitles: The available subtitles as a dictionary in the format
172 {language: subformats}. "subformats" is a list sorted from
173 lower to higher preference, each element is a dictionary
174 with the "ext" entry and one of:
175 * "data": The subtitles file contents
10952eb2 176 * "url": A URL pointing to the subtitles file
4bba3716 177 "ext" will be calculated from URL if missing
360e1ca5
JMF
178 automatic_captions: Like 'subtitles', used by the YoutubeIE for
179 automatically generated captions
62d231c0 180 duration: Length of the video in seconds, as an integer or float.
f3d29461 181 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
182 like_count: Number of positive ratings of the video
183 dislike_count: Number of negative ratings of the video
02835c6b 184 repost_count: Number of reposts of the video
2d30521a 185 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 186 comment_count: Number of comments on the video
dd622d7c
PH
187 comments: A list of comments, each with one or more of the following
188 properties (all but one of text or html optional):
189 * "author" - human-readable name of the comment author
190 * "author_id" - user ID of the comment author
191 * "id" - Comment ID
192 * "html" - Comment as HTML
193 * "text" - Plain text of the comment
194 * "timestamp" - UNIX timestamp of comment
195 * "parent" - ID of the comment this one is replying to.
196 Set to "root" to indicate that this is a
197 comment to the original video.
8dbe9899 198 age_limit: Age restriction for the video, as an integer (years)
10952eb2 199 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
200 should allow to get the same result again. (It will be set
201 by YoutubeDL if it's missing)
ad3bc6ac
PH
202 categories: A list of categories that the video falls in, for example
203 ["Sports", "Berlin"]
864f24bd 204 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
205 is_live: True, False, or None (=unknown). Whether this video is a
206 live stream that goes on instead of a fixed-length video.
7c80519c 207 start_time: Time in seconds where the reproduction should start, as
10952eb2 208 specified in the URL.
297a564b 209 end_time: Time in seconds where the reproduction should end, as
10952eb2 210 specified in the URL.
d6983cb4 211
7109903e
S
212 The following fields should only be used when the video belongs to some logical
213 chapter or section:
214
215 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
216 chapter_number: Number of the chapter the video belongs to, as an integer.
217 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
218
219 The following fields should only be used when the video is an episode of some
220 series or programme:
221
222 series: Title of the series or programme the video episode belongs to.
223 season: Title of the season the video episode belongs to.
27bfd4e5
S
224 season_number: Number of the season the video episode belongs to, as an integer.
225 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
226 episode: Title of the video episode. Unlike mandatory video title field,
227 this field should denote the exact title of the video episode
228 without any kind of decoration.
27bfd4e5
S
229 episode_number: Number of the video episode within a season, as an integer.
230 episode_id: Id of the video episode, as a unicode string.
7109903e 231
deefc05b 232 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 233
d838b1bd
PH
234 Unless mentioned otherwise, None is equivalent to absence of information.
235
fed5d032
PH
236
237 _type "playlist" indicates multiple videos.
b82f815f
PH
238 There must be a key "entries", which is a list, an iterable, or a PagedList
239 object, each element of which is a valid dictionary by this specification.
fed5d032 240
e0b9d78f
S
241 Additionally, playlists can have "title", "description" and "id" attributes
242 with the same semantics as videos (see above).
fed5d032
PH
243
244
245 _type "multi_video" indicates that there are multiple videos that
246 form a single show, for example multiple acts of an opera or TV episode.
247 It must have an entries key like a playlist and contain all the keys
248 required for a video at the same time.
249
250
251 _type "url" indicates that the video must be extracted from another
252 location, possibly by a different extractor. Its only required key is:
253 "url" - the next URL to extract.
f58766ce
PH
254 The key "ie_key" can be set to the class name (minus the trailing "IE",
255 e.g. "Youtube") if the extractor class is known in advance.
256 Additionally, the dictionary may have any properties of the resolved entity
257 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
258 known ahead of time.
259
260
261 _type "url_transparent" entities have the same specification as "url", but
262 indicate that the given additional information is more precise than the one
263 associated with the resolved URL.
264 This is useful when a site employs a video service that hosts the video and
265 its technical metadata, but that video service does not embed a useful
266 title, description etc.
267
268
d6983cb4
PH
269 Subclasses of this one should re-define the _real_initialize() and
270 _real_extract() methods and define a _VALID_URL regexp.
271 Probably, they should also be added to the list of extractors.
272
d6983cb4
PH
273 Finally, the _WORKING attribute should be set to False for broken IEs
274 in order to warn the users and skip the tests.
275 """
276
# True once initialize() has run _real_initialize() for this instance.
_ready = False
# Downloader object used for output and network access; see set_downloader().
_downloader = None
# Value returned by working(); set to False in subclasses for broken IEs.
_WORKING = True
280
281 def __init__(self, downloader=None):
282 """Constructor. Receives an optional downloader."""
283 self._ready = False
284 self.set_downloader(downloader)
285
@classmethod
def suitable(cls, url):
    """Return True if url matches this extractor's _VALID_URL pattern."""
    # The compiled pattern is cached per class.  cls.__dict__ is consulted
    # directly (no hasattr/getattr) because normal attribute lookup would
    # also pick up a pattern cached on a superclass.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = re.compile(cls._VALID_URL)
        cls._VALID_URL_RE = valid_url_re
    match = valid_url_re.match(url)
    return match is not None
d6983cb4 296
ed9266db
PH
297 @classmethod
298 def _match_id(cls, url):
299 if '_VALID_URL_RE' not in cls.__dict__:
300 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
301 m = cls._VALID_URL_RE.match(url)
302 assert m
303 return m.group('id')
304
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class-level _WORKING flag."""
    is_working = cls._WORKING
    return is_working
309
def initialize(self):
    """Run the one-time setup (authentication, etc) if not done yet."""
    if self._ready:
        # _real_initialize() already ran for this instance.
        return
    self._real_initialize()
    self._ready = True
315
def extract(self, url):
    """Initialize the extractor and return what _real_extract(url) yields.

    ExtractorError is passed through untouched.  An IncompleteRead is
    re-raised as an expected ExtractorError (network problem), while
    KeyError and StopIteration are re-raised as an unexpected
    ExtractorError; the original exception is attached as the cause.
    """
    try:
        self.initialize()
        info = self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
    return info
d6983cb4
PH
327
def set_downloader(self, downloader):
    """Remember downloader as the object this IE reports to and fetches through."""
    self._downloader = downloader
331
332 def _real_initialize(self):
333 """Real initialization process. Redefine in subclasses."""
334 pass
335
336 def _real_extract(self, url):
337 """Real extraction process. Redefine in subclasses."""
338 pass
339
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    class_name = cls.__name__
    # The key is the class name without its last two characters
    # (the trailing "IE" of extractor class names).
    return compat_str(class_name[:-2])
56c73665 344
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of this extractor: the class name minus its last two characters."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
d6983cb4 348
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
    """Open url_or_request through the downloader and return the response handle.

    note:    progress message; None prints the default "Downloading webpage"
             report, False prints nothing.
    errnote: prefix of the error message; None selects a default text, False
             makes a failed request return False silently.
    fatal:   on failure raise ExtractorError if true, otherwise emit a
             warning and return False.
    data, headers, query: only applied when url_or_request is a string.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        message = '%s' % (note,) if video_id is None else '%s: %s' % (video_id, note)
        self.to_screen(message)

    # data, headers and query params will be ignored for `Request` objects
    if isinstance(url_or_request, compat_str):
        if query:
            url_or_request = update_url_query(url_or_request, query)
        if data or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers or {})

    network_errors = (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error)
    try:
        return self._downloader.urlopen(url_or_request)
    except network_errors as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
        return False
d6983cb4 378
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
    """Return a (page content as string, URL handle) tuple.

    False is returned instead when the request fails and fatal is false.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    urlh = self._request_webpage(
        url_or_request, video_id, note, errnote, fatal,
        data=data, headers=headers, query=query)
    if urlh is False:
        # _request_webpage only returns False for non-fatal requests.
        assert not fatal
        return False
    content = self._webpage_read_content(
        urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
391
c9a77969
YCH
392 @staticmethod
393 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
394 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
395 if m:
396 encoding = m.group(1)
397 else:
0d75ae2c 398 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
399 webpage_bytes[:1024])
400 if m:
401 encoding = m.group(1).decode('ascii')
b60016e8
PH
402 elif webpage_bytes.startswith(b'\xff\xfe'):
403 encoding = 'utf-16'
f143d86a
PH
404 else:
405 encoding = 'utf-8'
c9a77969
YCH
406
407 return encoding
408
409 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
410 content_type = urlh.headers.get('Content-Type', '')
411 webpage_bytes = urlh.read()
412 if prefix is not None:
413 webpage_bytes = prefix + webpage_bytes
414 if not encoding:
415 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
416 if self._downloader.params.get('dump_intermediate_pages', False):
417 try:
418 url = url_or_request.get_full_url()
419 except AttributeError:
420 url = url_or_request
f1a9d64e 421 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
422 dump = base64.b64encode(webpage_bytes).decode('ascii')
423 self._downloader.to_screen(dump)
d41e6efc
PH
424 if self._downloader.params.get('write_pages', False):
425 try:
426 url = url_or_request.get_full_url()
427 except AttributeError:
428 url = url_or_request
5afa7f8b 429 basen = '%s_%s' % (video_id, url)
c1bce22f 430 if len(basen) > 240:
f1a9d64e 431 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
432 basen = basen[:240 - len(h)] + h
433 raw_filename = basen + '.dump'
d41e6efc 434 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 435 self.to_screen('Saving request to ' + filename)
5f58165d
S
436 # Working around MAX_PATH limitation on Windows (see
437 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
e9c0cdd3 438 if compat_os_name == 'nt':
5f58165d
S
439 absfilepath = os.path.abspath(filename)
440 if len(absfilepath) > 259:
441 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
442 with open(filename, 'wb') as outf:
443 outf.write(webpage_bytes)
444
ec0fafbb
AA
445 try:
446 content = webpage_bytes.decode(encoding, 'replace')
447 except LookupError:
448 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 449
f1a9d64e
PH
450 if ('<title>Access to this site is blocked</title>' in content and
451 'Websense' in content[:512]):
452 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
453 blocked_iframe = self._html_search_regex(
454 r'<iframe src="([^"]+)"', content,
f1a9d64e 455 'Websense information URL', default=None)
2410c43d 456 if blocked_iframe:
f1a9d64e 457 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 458 raise ExtractorError(msg, expected=True)
77b2986b
PH
459 if '<title>The URL you requested has been blocked</title>' in content[:512]:
460 msg = (
461 'Access to this webpage has been blocked by Indian censorship. '
462 'Use a VPN or proxy server (with --proxy) to route around it.')
463 block_msg = self._html_search_regex(
464 r'</h1><p>(.*?)</p>',
465 content, 'block message', default=None)
466 if block_msg:
467 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
468 raise ExtractorError(msg, expected=True)
2410c43d 469
23be51d8 470 return content
d6983cb4 471
cdfee168 472 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
d6983cb4 473 """ Returns the data of the page as a string """
995ad69c
TF
474 success = False
475 try_count = 0
476 while success is False:
477 try:
cdfee168 478 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
995ad69c
TF
479 success = True
480 except compat_http_client.IncompleteRead as e:
481 try_count += 1
482 if try_count >= tries:
483 raise e
484 self._sleep(timeout, video_id)
7cc3570e
PH
485 if res is False:
486 return res
487 else:
488 content, _ = res
489 return content
d6983cb4 490
2a275ab0 491 def _download_xml(self, url_or_request, video_id,
f1a9d64e 492 note='Downloading XML', errnote='Unable to download XML',
cdfee168 493 transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
267ed0c5 494 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 495 xml_string = self._download_webpage(
cdfee168 496 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
28746fbd
PH
497 if xml_string is False:
498 return xml_string
e2b38da9
PH
499 if transform_source:
500 xml_string = transform_source(xml_string)
36e6f62c 501 return compat_etree_fromstring(xml_string.encode('utf-8'))
267ed0c5 502
3d3538e4 503 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
504 note='Downloading JSON metadata',
505 errnote='Unable to download JSON metadata',
b090af59 506 transform_source=None,
cdfee168 507 fatal=True, encoding=None, data=None, headers=None, query=None):
b090af59 508 json_string = self._download_webpage(
c9a77969 509 url_or_request, video_id, note, errnote, fatal=fatal,
cdfee168 510 encoding=encoding, data=data, headers=headers, query=query)
b090af59
PH
511 if (not fatal) and json_string is False:
512 return None
ebb64199
TF
513 return self._parse_json(
514 json_string, video_id, transform_source=transform_source, fatal=fatal)
515
516 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
517 if transform_source:
518 json_string = transform_source(json_string)
3d3538e4
PH
519 try:
520 return json.loads(json_string)
521 except ValueError as ve:
e7b6d122
PH
522 errmsg = '%s: Failed to parse JSON ' % video_id
523 if fatal:
524 raise ExtractorError(errmsg, cause=ve)
525 else:
526 self.report_warning(errmsg + str(ve))
3d3538e4 527
def report_warning(self, msg, video_id=None):
    """Send msg to the downloader as a warning, prefixed with '[ie_name]' and the video id (if any)."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 532
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
536
def report_extraction(self, id_or_name):
    """Tell the user that information for id_or_name is being extracted."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
540
def report_download_webpage(self, video_id):
    """Tell the user that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
544
def report_age_confirmation(self):
    """Tell the user that an age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 548
fc79158d
JMF
def report_login(self):
    """Tell the user that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 552
43e7d3c9
S
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError asking the user for account credentials."""
    hint = '%s. Use --username and --password or --netrc to provide account credentials.' % msg
    raise ExtractorError(hint, expected=True)
558
c430802e
S
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError suggesting --proxy as a workaround."""
    hint = '%s. You might want to use --proxy to workaround.' % msg
    raise ExtractorError(hint, expected=True)
564
5f6a1245 565 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Build a '_type': 'url' result pointing at a page that should be processed.

    'id' and 'title' are only included when video_id / video_title are
    not None; 'ie_key' is always present (possibly None).
    """
    # TODO: ie should be the class used for getting the info
    result = {'_type': 'url', 'url': url, 'ie_key': ie}
    for key, value in (('id', video_id), ('title', video_title)):
        if value is not None:
            result[key] = value
    return result
5f6a1245 578
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a '_type': 'playlist' result around entries.

    'id', 'title' and 'description' are only included when the
    corresponding argument is truthy.
    """
    result = {'_type': 'playlist', 'entries': entries}
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
591
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a pattern string, a compiled regex, or an iterable of them;
             the patterns are tried in order and the first match wins.
    group:   group to return from the match; when None the first group
             that took part in the match is returned.
    default: returned when nothing matches (takes precedence over fatal).
    """
    # Start from "no match" so that an empty collection of patterns ends up
    # in the default/fatal handling below instead of failing with an
    # UnboundLocalError.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name on colour-capable terminals.
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
625
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result (when truthy) is passed through
    clean_html() and stripped of surrounding whitespace.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        # None, a falsy default or an empty match is handed back untouched.
        return res
    return clean_html(res).strip()
635
fc79158d
JMF
636 def _get_login_info(self):
637 """
cf0649f8 638 Get the login info as (username, password)
fc79158d
JMF
639 It will look in the netrc file using the _NETRC_MACHINE value
640 If there's no info available, return (None, None)
641 """
642 if self._downloader is None:
643 return (None, None)
644
645 username = None
646 password = None
647 downloader_params = self._downloader.params
648
649 # Attempt to use provided username and password or .netrc data
d800609c 650 if downloader_params.get('username') is not None:
fc79158d
JMF
651 username = downloader_params['username']
652 password = downloader_params['password']
653 elif downloader_params.get('usenetrc', False):
654 try:
655 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
656 if info is not None:
657 username = info[0]
658 password = info[2]
659 else:
660 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
661 except (IOError, netrc.NetrcParseError) as err:
9b9c5355 662 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
5f6a1245 663
fc79158d
JMF
664 return (username, password)
665
e64b7569 666 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 667 """
668 Get the two-factor authentication info
669 TODO - asking the user will be required for sms/phone verify
670 currently just uses the command line option
671 If there's no info available, return None
672 """
673 if self._downloader is None:
83317f69 674 return None
675 downloader_params = self._downloader.params
676
d800609c 677 if downloader_params.get('twofactor') is not None:
83317f69 678 return downloader_params['twofactor']
679
e64b7569 680 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 681
46720279
JMF
682 # Helper functions for extracting OpenGraph info
683 @staticmethod
ab2d5247 684 def _og_regexes(prop):
448ef1f3 685 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
686 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
687 % {'prop': re.escape(prop)})
78fb87b2 688 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 689 return [
78fb87b2
JMF
690 template % (property_re, content_re),
691 template % (content_re, property_re),
ab2d5247 692 ]
46720279 693
864f24bd
S
694 @staticmethod
695 def _meta_regex(prop):
696 return r'''(?isx)<meta
8b9848ac 697 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
698 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
699
3c4e6d83 700 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 701 if name is None:
3c4e6d83 702 name = 'OpenGraph %s' % prop
ab2d5247 703 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
704 if escaped is None:
705 return None
706 return unescapeHTML(escaped)
46720279
JMF
707
708 def _og_search_thumbnail(self, html, **kargs):
10952eb2 709 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
710
711 def _og_search_description(self, html, **kargs):
712 return self._og_search_property('description', html, fatal=False, **kargs)
713
714 def _og_search_title(self, html, **kargs):
715 return self._og_search_property('title', html, **kargs)
716
8ffa13e0 717 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
718 regexes = self._og_regexes('video') + self._og_regexes('video:url')
719 if secure:
720 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 721 return self._html_search_regex(regexes, html, name, **kargs)
46720279 722
78338f71
JMF
723 def _og_search_url(self, html, **kargs):
724 return self._og_search_property('url', html, **kargs)
725
40c696e5 726 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
727 if display_name is None:
728 display_name = name
729 return self._html_search_regex(
864f24bd 730 self._meta_regex(name),
711ede6e 731 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
732
def _dc_search_uploader(self, html):
    """Return the content of the 'dc.creator' <meta> tag, labelled 'uploader'."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
735
8dbe9899
PH
736 def _rta_search(self, html):
737 # See http://www.rtalabel.org/index.php?content=howtofaq#single
738 if re.search(r'(?ix)<meta\s+name="rating"\s+'
739 r' content="RTA-5042-1996-1400-1577-RTA"',
740 html):
741 return 18
742 return 0
743
59040888
PH
def _media_rating_search(self, html):
    """Map the page's 'rating' <meta> value to a number.

    Returns None when no rating is found or when the lower-cased rating is
    not one of the known labels below.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    if rating:
        limit_by_label = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return limit_by_label.get(rating.lower())
    return None
59040888 759
def _family_friendly_search(self, html):
    """Map the page's 'isFamilyFriendly' <meta> value to a number.

    '1'/'true' give 0 and '0'/'false' give 18 (compared lower-cased).
    Returns None when the tag is missing/empty or holds any other value.
    """
    # See http://schema.org/VideoObject
    flag = self._html_search_meta('isFamilyFriendly', html)
    if flag:
        limit_by_flag = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return limit_by_flag.get(flag.lower())
    return None
69319969 774
0c708f11
JMF
def _twitter_search_player(self, html):
    """Return the content of the 'twitter:player' <meta> tag of html."""
    player = self._html_search_meta(
        'twitter:player', html, 'twitter card player')
    return player
0c708f11 778
def _search_json_ld(self, html, video_id, **kwargs):
    """Find a JSON-LD <script> block in html and extract metadata from it.

    The body of the first <script type="application/ld+json"> element is
    located with _search_regex (extra keyword arguments are forwarded) and
    handed to _json_ld. Returns {} when nothing usable was found.
    """
    found = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    if found:
        # _json_ld gets the caller's 'fatal' flag, treated as True when absent
        return self._json_ld(found, video_id, fatal=kwargs.get('fatal', True))
    return {}
4ca2a3cf
S
786
def _json_ld(self, json_ld, video_id, fatal=True):
    """Turn a schema.org JSON-LD object into an info dict fragment.

    json_ld may be a string (parsed with _parse_json) or an already parsed
    mapping. Only objects whose '@context' is 'http://schema.org' and whose
    '@type' is 'TVEpisode' or 'Article' contribute fields; anything else
    yields {}. Keys whose value is None are left out of the result.
    """
    data = json_ld
    if isinstance(data, compat_str):
        data = self._parse_json(data, video_id, fatal=fatal)
    if not data:
        return {}
    if data.get('@context') != 'http://schema.org':
        return {}

    info = {}
    item_type = data.get('@type')
    if item_type == 'TVEpisode':
        info['episode'] = unescapeHTML(data.get('name'))
        info['episode_number'] = int_or_none(data.get('episodeNumber'))
        info['description'] = unescapeHTML(data.get('description'))
        season = data.get('partOfSeason')
        if isinstance(season, dict) and season.get('@type') == 'TVSeason':
            info['season_number'] = int_or_none(season.get('seasonNumber'))
        series = data.get('partOfSeries')
        if isinstance(series, dict) and series.get('@type') == 'TVSeries':
            info['series'] = unescapeHTML(series.get('name'))
    elif item_type == 'Article':
        info['timestamp'] = parse_iso8601(data.get('datePublished'))
        info['title'] = unescapeHTML(data.get('headline'))
        info['description'] = unescapeHTML(data.get('articleBody'))
    return dict((key, value) for key, value in info.items() if value is not None)
814
27713812 815 @staticmethod
f8da79f8 816 def _hidden_inputs(html):
586f1cc5 817 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 818 hidden_inputs = {}
73eb13df 819 for input in re.findall(r'(?i)<input([^>]+)>', html):
be0e5dbd 820 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
201ea3ee
S
821 continue
822 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
823 if not name:
824 continue
825 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
826 if not value:
827 continue
828 hidden_inputs[name.group('value')] = value.group('value')
829 return hidden_inputs
27713812 830
cf61d96d
S
def _form_hidden_inputs(self, form_id, html):
    """Return the hidden inputs of the <form> whose id is form_id.

    form_id is interpolated into the search pattern as is. The inner HTML
    of the matching form is located with _search_regex and handed to
    _hidden_inputs.
    """
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
836
def _sort_formats(self, formats, field_preference=None):
    """Sort the list of format dicts in place.

    formats is sorted ascending by the key built in _formats_key, so
    entries with larger key values end up at the end of the list.

    When field_preference is a list or tuple of field names, the key is
    just those fields in the given order (a missing/None field counts
    as -1). Otherwise a fixed built-in ordering is used.

    Side effects on the dicts: 'tbr' is filled in from abr + vbr when it
    is absent and both are known, and 'ext' is derived from 'url' when it
    is empty.

    Raises ExtractorError when formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for f in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        # Caller-supplied ordering replaces the default key entirely
        if isinstance(field_preference, (list, tuple)):
            return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            preference = 0
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        # Formats whose protocol is not plain HTTP(S) rank slightly lower
        proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

        # The position in ORDER is the extension's rank; unknown extensions get -1
        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Earlier tuple elements dominate later ones; numeric fields that
        # are missing or None count as -1, a missing format_id as ''
        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            proto_preference,
            ext_preference,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id') if f.get('format_id') is not None else '',
        )
    formats.sort(key=_formats_key)
59040888 904
96a53167
S
905 def _check_formats(self, formats, video_id):
906 if formats:
907 formats[:] = filter(
908 lambda f: self._is_valid_url(
909 f['url'], video_id,
910 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
911 formats)
912
f5bdb444
S
913 @staticmethod
914 def _remove_duplicate_formats(formats):
915 format_urls = set()
916 unique_formats = []
917 for f in formats:
918 if f['url'] not in format_urls:
919 format_urls.add(f['url'])
920 unique_formats.append(f)
921 formats[:] = unique_formats
922
96a53167 923 def _is_valid_url(self, url, video_id, item='video'):
2f0f6578
S
924 url = self._proto_relative_url(url, scheme='http:')
925 # For now assume non HTTP(S) URLs always valid
926 if not (url.startswith('http://') or url.startswith('https://')):
927 return True
96a53167 928 try:
4069766c 929 self._request_webpage(url, video_id, 'Checking %s URL' % item)
96a53167
S
930 return True
931 except ExtractorError as e:
943a1e24 932 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
933 self.to_screen(
934 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
935 return False
936 raise
937
20991253 938 def http_scheme(self):
1ede5b24 939 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
940 return (
941 'http:'
942 if self._downloader.params.get('prefer_insecure', False)
943 else 'https:')
944
57c7411f
PH
945 def _proto_relative_url(self, url, scheme=None):
946 if url is None:
947 return url
948 if url.startswith('//'):
949 if scheme is None:
950 scheme = self.http_scheme()
951 return scheme + url
952 else:
953 return url
954
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """Print a waiting message and block for timeout seconds.

    msg_template is a %-style template that may reference %(video_id)s and
    %(timeout)s; a default message is used when it is None.
    """
    template = msg_template
    if template is None:
        template = '%(video_id)s: Waiting for %(timeout)s seconds'
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
961
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """Download the f4m manifest at manifest_url and return its formats.

    The manifest is fetched with _download_xml (transform_source and fatal
    are forwarded). When that returns False an empty list is returned;
    otherwise the parsed manifest is handed to _parse_f4m_formats.
    """
    # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
    # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
    f4m_doc = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        transform_source=transform_source,
        fatal=fatal)

    if f4m_doc is not False:
        return self._parse_f4m_formats(
            f4m_doc, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal)
    return []
979
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True):
    """Build format dicts from a parsed f4m manifest.

    manifest is the parsed XML root. <media> nodes are looked up in the
    f4m 1.0 namespace first and, if there are none, in the 2.0 namespace.

    For 1.0 manifests every format points at manifest_url itself. For 2.0
    manifests each <media> node's href/url is used: absolute HTTP(S) URLs
    are taken as is, relative ones are resolved against the manifest's
    <baseURL> or, failing that, the directory of manifest_url. Media URLs
    that are themselves f4m manifests are extracted recursively via
    _extract_f4m_formats (transform_source and fatal are only used there).

    Returns the formats after sorting them with _sort_formats.
    """
    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Computed once from the manifest's own URL so that every relative media
    # URL is resolved against the manifest. Previously manifest_url was
    # overwritten inside the loop, which made later entries resolve against
    # the preceding media URL.
    manifest_dir = '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            format_url = (
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((base_url or manifest_dir) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # The node index stands in for the bitrate when none is declared
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
1024
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return a list of format dicts.

    Returns [] when _download_webpage_handle returns False. A media
    playlist (one containing #EXT-X-TARGETDURATION) is returned as a
    single format pointing at the playlist itself. Otherwise the result is
    a 'meta' format for the playlist URL plus one format per URI line,
    sorted with _sort_formats.

    ext, entry_protocol and preference are copied into the produced
    formats; m3u8_id, when given, prefixes the format ids. note/errnote
    override the default download messages.
    """

    # Entry for the playlist itself; ranked one below the requested
    # preference (or -1 when no preference is given)
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    # m3u8_url is looked up when the lambda is called, i.e. after it has
    # been replaced by the final URL reported by the response below
    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    m3u8_url = urlh.geturl()

    # We should try extracting formats only from master playlists [1], i.e.
    # playlists that describe available qualities. On the other hand media
    # playlists [2] should be returned as is since they contain just the media
    # without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 2] master
    # playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
    # and MUST NOT appear in master playlist thus we can clearly detect media
    # playlist with this criterion.
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
    # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]
    # Attributes of the most recent #EXT-X-STREAM-INF / #EXT-X-MEDIA tag;
    # they apply to the next URI line
    last_info = None
    last_media = None
    # Matches KEY=value or KEY="quoted value" pairs of an attribute list
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#EXT-X-MEDIA:'):
            last_media = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_media[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # Other tags, comments and blank lines carry nothing we use
            continue
        else:
            # URI line
            if last_info is None:
                # No #EXT-X-STREAM-INF seen so far: only the URL is known
                formats.append({'url': format_url(line)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            # Prefer the NAME of the preceding non-subtitle #EXT-X-MEDIA tag,
            # then the bitrate, then the running number of formats
            last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
            format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
            f = {
                'format_id': '-'.join(format_id),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            codecs = last_info.get('CODECS')
            if codecs:
                vcodec, acodec = [None] * 2
                va_codecs = codecs.split(',')
                if len(va_codecs) == 1:
                    # Audio only entries usually come with single codec and
                    # no resolution. For more robustness we also check it to
                    # be mp4 audio.
                    if not resolution and va_codecs[0].startswith('mp4a'):
                        vcodec, acodec = 'none', va_codecs[0]
                    else:
                        vcodec = va_codecs[0]
                else:
                    vcodec, acodec = va_codecs[:2]
                f.update({
                    'acodec': acodec,
                    'vcodec': vcodec,
                })
            if last_media is not None:
                # A media tag is attached to the first following URI only
                f['m3u8_media'] = last_media
                last_media = None
            formats.append(f)
            # Reset to an empty dict (not None), so later URI lines still get
            # a full format entry even without their own #EXT-X-STREAM-INF
            last_info = {}
    self._sort_formats(formats)
    return formats
1145
a107193e
S
1146 @staticmethod
1147 def _xpath_ns(path, namespace=None):
1148 if not namespace:
1149 return path
1150 out = []
1151 for c in path.split('/'):
1152 if not c or c == '.':
1153 out.append(c)
1154 else:
1155 out.append('{%s}%s' % (namespace, c))
1156 return '/'.join(out)
1157
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download the SMIL document at smil_url and return its formats.

    Returns [] when _download_smil returns False (this is asserted to
    happen only for non-fatal calls). Otherwise the document's namespace is
    determined and parsing is delegated to _parse_smil_formats.
    """
    smil_doc = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)

    if smil_doc is False:
        assert not fatal
        return []

    return self._parse_smil_formats(
        smil_doc, smil_url, video_id,
        namespace=self._parse_smil_namespace(smil_doc), f4m_params=f4m_params)
1169
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download the SMIL document at smil_url and return its info dict.

    Returns {} when _download_smil returns False; otherwise the result of
    _parse_smil.
    """
    smil_doc = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil_doc is not False:
        return self._parse_smil(smil_doc, smil_url, video_id, f4m_params=f4m_params)
    return {}
1175
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch smil_url through _download_xml with SMIL-specific messages.

    fatal and transform_source are forwarded to _download_xml and its
    result is returned as is.
    """
    smil_doc = self._download_xml(
        smil_url, video_id, 'Downloading SMIL file',
        'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
    return smil_doc
a107193e
S
1180
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats are parsed with the video_id passed in, but the returned 'id'
    is the basename of smil_url without its extension. Title, description
    and upload date come from the first matching ./head/meta entries; the
    title falls back to the id. Every <image> element with a src becomes a
    thumbnail.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    smil_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        meta_name = meta.attrib.get('name')
        meta_content = meta.attrib.get('content')
        if not (meta_name and meta_content):
            continue
        # Only the first usable value of each kind is kept
        if meta_name == 'title':
            title = title or meta_content
        elif meta_name in ('description', 'abstract'):
            description = description or meta_content
        elif meta_name == 'date':
            upload_date = upload_date or unified_strdate(meta_content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': smil_id,
        'title': title or smil_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1220
17712eeb
S
def _parse_smil_namespace(self, smil):
    """Return the XML namespace from the root tag '{ns}smil', or None."""
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1224
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Build format dicts from the <video> elements of a SMIL document.

    Each distinct, non-empty src is handled as exactly one of: RTMP
    stream, m3u8 playlist, f4m manifest or plain HTTP URL (checked in that
    order). Relative sources are resolved against the first 'base' or
    'httpBase' found in ./head/meta, falling back to smil_url.

    f4m_params are the query parameters appended to f4m manifest URLs
    (defaults are filled in when empty). transform_rtmp_url, when given,
    is called as transform_rtmp_url(streamer, src) and must return the
    replacement (url, play_path) pair for RTMP formats.

    Returns the formats after sorting them with _sort_formats.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    # Per-protocol counters, used in format ids when no bitrate is known
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    # Sources already processed; repeated ones are skipped
    srcs = []
    videos = smil.findall(self._xpath_ns('.//video', namespace))
    for video in videos:
        src = video.get('src')
        if not src or src in srcs:
            continue
        srcs.append(src)

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # Only a single resulting format is annotated with the bitrate
            # and dimensions declared on this <video> element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            # Defaults assigned here stay in effect for later <video> elements
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # NOTE(review): the validity check receives the raw src rather than
        # the resolved src_url, so a relative src is accepted without any
        # request being made - confirm whether src_url was intended here
        if src_url.startswith('http') and self._is_valid_url(src, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

    self._sort_formats(formats)

    return formats
1320
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitles from the <textstream> elements of a SMIL document.

    Returns a dict mapping a language to a list of {'url', 'ext'} dicts.
    Elements without src, or with a src seen before, are skipped. The
    language comes from systemLanguage, systemLanguageName or lang, with
    subtitles_lang as the fallback.
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)
        ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
        lang = (
            textstream.get('systemLanguage') or
            textstream.get('systemLanguageName') or
            textstream.get('lang') or
            subtitles_lang)
        subtitles.setdefault(lang, []).append({'url': src, 'ext': ext})
    return subtitles
63757032 1336
942acef5
S
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download the XSPF playlist at playlist_url and return its entries.

    Returns [] when _download_xml returns False; otherwise the result of
    _parse_xspf.
    """
    xspf_doc = self._download_xml(
        playlist_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf_doc is not False:
        return self._parse_xspf(xspf_doc, playlist_id)
    return []
8d6765cf 1344
def _parse_xspf(self, playlist, playlist_id):
    """Turn the <track> elements of a parsed XSPF playlist into entries.

    Every entry uses playlist_id as its 'id'; the title falls back to
    playlist_id as well. Each <location> of a track becomes one format
    and the formats of every track are sorted with _sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def _ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(_ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, _ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, _ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, _ns('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(track, _ns('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(_ns('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(_ns('s1:label')),
                'width': int_or_none(location.get(_ns('s1:width'))),
                'height': int_or_none(location.get(_ns('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1379
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """Download the MPD manifest at mpd_url and return its formats.

    Returns [] when _download_webpage_handle returns False. Otherwise the
    manifest is parsed as XML and handed to _parse_mpd_formats together
    with a base URL: the final response URL up to and including its last
    '/'. note/errnote override the default download messages; formats_dict
    is forwarded untouched.
    """
    download = self._download_webpage_handle(
        mpd_url, video_id,
        note=note or 'Downloading MPD manifest',
        errnote=errnote or 'Failed to download MPD manifest',
        fatal=fatal)
    if download is False:
        return []
    mpd_text, url_handle = download
    base_url = re.match(r'https?://.+/', url_handle.geturl()).group()
    mpd_root = compat_etree_fromstring(mpd_text.encode('utf-8'))

    return self._parse_mpd_formats(
        mpd_root, mpd_id, base_url, formats_dict=formats_dict)
2d2fa82d 1393
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
    """Build format dicts from a parsed DASH MPD document.

    mpd_doc is the XML root. Manifests with type="dynamic" yield []. For
    every Period / AdaptationSet / Representation that carries no
    ContentProtection element, a format is built for 'video' and 'audio'
    MIME types; 'text' is skipped and other types trigger a warning.

    mpd_id, when given, prefixes the format ids. mpd_base_url is prepended
    to base URLs that are not absolute HTTP(S) URLs. formats_dict maps a
    representation id to a dict of default fields that the extracted
    values are merged over; it is not modified.

    Returns the formats after sorting them with _sort_formats.
    """
    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent level's info and override it with
        # whatever SegmentList / SegmentTemplate data this element declares
        ms_info = ms_parent_info.copy()
        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            initialization = segment_list.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                start_number = segment_template.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        # Each <S> stands for one segment plus 'r' repeats
                        ms_info['total_number'] = 0
                        for s in s_e:
                            ms_info['total_number'] += 1 + int(s.get('r', '0'))
                else:
                    timescale = segment_template.get('timescale')
                    if timescale:
                        ms_info['timescale'] = int(timescale)
                    segment_duration = segment_template.get('duration')
                    if segment_duration:
                        ms_info['segment_duration'] = int(segment_duration)
                media_template = segment_template.get('media')
                if media_template:
                    ms_info['media_template'] = media_template
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization_url'] = initialization
                else:
                    initialization = segment_template.find(_add_ns('Initialization'))
                    if initialization is not None:
                        ms_info['initialization_url'] = initialization.attrib['sourceURL']
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = mime_type.split('/')[0]
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    pass
                elif content_type == 'video' or content_type == 'audio':
                    # Concatenate BaseURLs from the innermost element outwards
                    # until the result is an absolute HTTP(S) URL
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    f = {
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'url': base_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                        'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                    }
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                    if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                        # Derive the segment count from the period duration when
                        # no SegmentTimeline provided it. The second operand used
                        # to be the bare string 'segment_duration', which is
                        # always true; it is now a real membership test.
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                        media_template = representation_ms_info['media_template']
                        media_template = media_template.replace('$RepresentationID$', representation_id)
                        # Turn $Number$ / $Bandwidth$ (with optional %0Nd width)
                        # into %-style named placeholders
                        media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
                        # '$$' is the escape for a literal '$'; the result of
                        # this replace() used to be thrown away
                        media_template = media_template.replace('$$', '$')
                        representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    if 'segment_urls' in representation_ms_info:
                        f.update({
                            'segment_urls': representation_ms_info['segment_urls'],
                            'protocol': 'http_dash_segments',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                            f.update({
                                'initialization_url': initialization_url,
                            })
                            if not f.get('url'):
                                f['url'] = initialization_url
                    # Merge into an already collected format with this id, or
                    # append a new one seeded from formats_dict
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                    except StopIteration:
                        full_info = formats_dict.get(representation_id, {}).copy()
                        full_info.update(f)
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    self._sort_formats(formats)
    return formats
1538
f4b1c7ad
PH
1539 def _live_title(self, name):
1540 """ Generate the title for a live video """
1541 now = datetime.datetime.now()
611c1dd9 1542 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
1543 return name + ' ' + now_str
1544
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """ Convert v with int_or_none and report a failed conversion.

    name is only used to build the failure message; any extra keyword
    arguments are forwarded unchanged to int_or_none.

    Returns the converted value. When the conversion yields None, raises
    ExtractorError if fatal is true, otherwise emits a warning through the
    downloader and returns None.
    """
    res = int_or_none(v, **kwargs)
    # NOTE: a leftover debugging print(getattr(v, kwargs['get_attr'])) used to
    # live here. It polluted stdout and could raise AttributeError before the
    # failure handling below was reached, so it has been dropped.
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
1556
def _float(self, v, name, fatal=False, **kwargs):
    """ Convert v with float_or_none and report a failed conversion.

    name is only used in the failure message; extra keyword arguments are
    passed through to float_or_none. On failure an ExtractorError is raised
    when fatal is true, otherwise a warning is reported and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    message = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(message)
    self._downloader.report_warning(message)
    return parsed
1566
def _set_cookie(self, domain, name, value, expire_time=None):
    """ Store a cookie for domain (path '/') in the downloader's cookie jar """
    # Keyword form of the positional Cookie(...) constructor call
    cookie = compat_cookiejar.Cookie(
        version=0,
        name=name,
        value=value,
        port=None,
        port_specified=None,
        domain=domain,
        domain_specified=None,
        domain_initial_dot=None,
        path='/',
        path_specified=True,
        secure=False,
        expires=expire_time,
        discard='',
        comment=None,
        comment_url=None,
        rest=None)
    jar = self._downloader.cookiejar
    jar.set_cookie(cookie)
1572
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header of a throwaway request,
    # then parse that header back into a SimpleCookie.
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
1578
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """ Yield the test case dicts declared through _TEST or _TESTS.

    Test cases flagged 'only_matching' are skipped unless
    include_onlymatching is true. Every yielded dict gets a 'name' entry
    holding the class name without its trailing 'IE'.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])

    for case in candidates:
        if include_onlymatching or not case.get('only_matching', False):
            case['name'] = type(self).__name__[:-len('IE')]
            yield case
1592
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for testcase in self.get_testcases(include_onlymatching=False):
        # For playlist test cases only the first entry is inspected
        info = testcase['playlist'][0] if 'playlist' in testcase else testcase
        declared_limit = info.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            # A single unrestricted test case is enough
            return True
        saw_restricted = True
    # Suitable when there was no test case at all to judge by
    return not saw_restricted
1607
def extract_subtitles(self, *args, **kwargs):
    """ Return the result of _get_subtitles when subtitles were requested
    ('writesubtitles' or 'listsubtitles' downloader params), else {} """
    params = self._downloader.params
    requested = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not requested:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
1613
1614 def _get_subtitles(self, *args, **kwargs):
611c1dd9 1615 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 1616
912e0b7e
YCH
1617 @staticmethod
1618 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1619 """ Merge subtitle items for one language. Items with duplicated URLs
1620 will be dropped. """
1621 list1_urls = set([item['url'] for item in subtitle_list1])
1622 ret = list(subtitle_list1)
1623 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1624 return ret
1625
1626 @classmethod
8c97f819 1627 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 1628 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
1629 ret = dict(subtitle_dict1)
1630 for lang in subtitle_dict2:
8c97f819 1631 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
1632 return ret
1633
def extract_automatic_captions(self, *args, **kwargs):
    """ Return the result of _get_automatic_captions when automatic captions
    were requested ('writeautomaticsub' or 'listsubtitles' downloader
    params), else {} """
    params = self._downloader.params
    requested = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not requested:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
1639
1640 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 1641 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 1642
d77ab8e2
S
def mark_watched(self, *args, **kwargs):
    """ Call _mark_watched when the 'mark_watched' param is set and either a
    login name or a cookie file is available """
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    has_login = self._get_login_info()[0] is not None
    if has_login or params.get('cookiefile') is not None:
        self._mark_watched(*args, **kwargs)
1648
1649 def _mark_watched(self, *args, **kwargs):
1650 raise NotImplementedError('This method must be implemented by subclasses')
1651
8dbe9899 1652
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that run paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query> where <prefix> is
    empty (one result), a positive integer (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query handled by this extractor"""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            # No count given: a single result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests, telling the user about it
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY