]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Add --mark-watched feature (Closes #5054)
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
6a3828fd 1from __future__ import unicode_literals
f1a9d64e 2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
1bac3455 13import math
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
799207e8 17 compat_cookies,
e64b7569 18 compat_getpass,
d6983cb4
PH
19 compat_http_client,
20 compat_urllib_error,
a107193e 21 compat_urllib_parse,
f0b5d6af 22 compat_urlparse,
d6983cb4 23 compat_str,
36e6f62c 24 compat_etree_fromstring,
8c25f81b
PH
25)
26from ..utils import (
c342041f 27 NO_DEFAULT,
05900629 28 age_restricted,
08f2a92c 29 bug_reports_message,
d6983cb4
PH
30 clean_html,
31 compiled_regex_type,
70f0f5a8 32 determine_ext,
9b9c5355 33 error_to_compat_str,
d6983cb4 34 ExtractorError,
97f4aecf 35 fix_xml_ampersands,
b14f3a4c 36 float_or_none,
31bb8d3f 37 int_or_none,
4ca2a3cf 38 parse_iso8601,
55b3e45b 39 RegexNotFoundError,
d41e6efc 40 sanitize_filename,
5c2266df 41 sanitized_Request,
f38de77f 42 unescapeHTML,
647eab45 43 unified_strdate,
a107193e 44 url_basename,
8d6765cf
S
45 xpath_text,
46 xpath_with_ns,
d497a201 47 determine_protocol,
1bac3455 48 parse_duration,
cafcf657 49 mimetype2ext,
d6983cb4 50)
c342041f 51
d6983cb4
PH
52
53class InfoExtractor(object):
54 """Information Extractor class.
55
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
5d380852 60 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
63
cf0649f8 64 The type field determines the type of the result.
fed5d032
PH
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
67
68 For a video, the dictionaries must include the following fields:
d6983cb4
PH
69
70 id: Video identifier.
d6983cb4 71 title: Video title, unescaped.
d67b0b15 72
f49d89ee 73 Additionally, it must contain either a formats entry or a url one:
d67b0b15 74
f49d89ee
PH
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
77
78 Potential fields:
d67b0b15 79 * url Mandatory. The URL of the video file
10952eb2 80 * ext Will be calculated from URL if missing
d67b0b15
PH
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height.
84 and format_note fields if missing.
85 * format_id A short description of the format
5d4f3985
PH
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
d67b0b15
PH
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
f49d89ee 92 * resolution Textual description of width and height
7217e148 93 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
dd27fd17 96 * asr Audio sampling rate in Hertz
d67b0b15 97 * vbr Average video bitrate in KBit/s
fbb21cf5 98 * fps Frame rate
d67b0b15 99 * vcodec Name of the video codec in use
1394ce65 100 * container Name of the container format
d67b0b15 101 * filesize The number of bytes, if known in advance
9732d77e 102 * filesize_approx An estimate for the number of bytes
d67b0b15 103 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
b04b8852
PH
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
f49d89ee 108 * preference Order number of this format. If this field is
08d13955 109 present and not None, the formats get sorted
38d63d84 110 by this field, regardless of all other values.
f49d89ee
PH
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
e65566a9
PH
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
32f90364
PH
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
117 the URL?
aff2f4f4
PH
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
5d73273f
PH
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
c64ed2a3
PH
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
d769be6c
PH
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
6271f1ca 131 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
136
c0ba0f48 137 url: Final video URL.
d6983cb4 138 ext: Video filename extension.
d67b0b15
PH
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 141
d6983cb4
PH
142 The following fields are optional:
143
f5e43bc6 144 alt_title: A secondary title of the video.
0afef30b
PH
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
d5519808 149 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 150 * "id" (optional, string) - Thumbnail format ID
d5519808 151 * "url"
cfb56d1a 152 * "preference" (optional, int) - quality of the image
d5519808
PH
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height}",
156 deprecated)
d6983cb4 157 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 158 description: Full video description.
d6983cb4 159 uploader: Full name of the video uploader.
9bb8e0a3 160 creator: The main artist who created the video.
8aab976b 161 release_date: The date (YYYYMMDD) when the video was released.
955c4514 162 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 163 upload_date: Video upload date (YYYYMMDD).
955c4514 164 If not explicitly set, calculated from timestamp.
d6983cb4 165 uploader_id: Nickname or id of the video uploader.
da9ec3b9 166 location: Physical location where the video was filmed.
a504ced0
JMF
167 subtitles: The available subtitles as a dictionary in the format
168 {language: subformats}. "subformats" is a list sorted from
169 lower to higher preference, each element is a dictionary
170 with the "ext" entry and one of:
171 * "data": The subtitles file contents
10952eb2 172 * "url": A URL pointing to the subtitles file
4bba3716 173 "ext" will be calculated from URL if missing
360e1ca5
JMF
174 automatic_captions: Like 'subtitles', used by the YoutubeIE for
175 automatically generated captions
62d231c0 176 duration: Length of the video in seconds, as an integer or float.
f3d29461 177 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
178 like_count: Number of positive ratings of the video
179 dislike_count: Number of negative ratings of the video
02835c6b 180 repost_count: Number of reposts of the video
2d30521a 181 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 182 comment_count: Number of comments on the video
dd622d7c
PH
183 comments: A list of comments, each with one or more of the following
184 properties (all but one of text or html optional):
185 * "author" - human-readable name of the comment author
186 * "author_id" - user ID of the comment author
187 * "id" - Comment ID
188 * "html" - Comment as HTML
189 * "text" - Plain text of the comment
190 * "timestamp" - UNIX timestamp of comment
191 * "parent" - ID of the comment this one is replying to.
192 Set to "root" to indicate that this is a
193 comment to the original video.
8dbe9899 194 age_limit: Age restriction for the video, as an integer (years)
10952eb2 195 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
196 should allow to get the same result again. (It will be set
197 by YoutubeDL if it's missing)
ad3bc6ac
PH
198 categories: A list of categories that the video falls in, for example
199 ["Sports", "Berlin"]
864f24bd 200 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
201 is_live: True, False, or None (=unknown). Whether this video is a
202 live stream that goes on instead of a fixed-length video.
7c80519c 203 start_time: Time in seconds where the reproduction should start, as
10952eb2 204 specified in the URL.
297a564b 205 end_time: Time in seconds where the reproduction should end, as
10952eb2 206 specified in the URL.
d6983cb4 207
7109903e
S
208 The following fields should only be used when the video belongs to some logical
209 chapter or section:
210
211 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
212 chapter_number: Number of the chapter the video belongs to, as an integer.
213 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
214
215 The following fields should only be used when the video is an episode of some
216 series or programme:
217
218 series: Title of the series or programme the video episode belongs to.
219 season: Title of the season the video episode belongs to.
27bfd4e5
S
220 season_number: Number of the season the video episode belongs to, as an integer.
221 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
222 episode: Title of the video episode. Unlike mandatory video title field,
223 this field should denote the exact title of the video episode
224 without any kind of decoration.
27bfd4e5
S
225 episode_number: Number of the video episode within a season, as an integer.
226 episode_id: Id of the video episode, as a unicode string.
7109903e 227
deefc05b 228 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 229
d838b1bd
PH
230 Unless mentioned otherwise, None is equivalent to absence of information.
231
fed5d032
PH
232
233 _type "playlist" indicates multiple videos.
b82f815f
PH
234 There must be a key "entries", which is a list, an iterable, or a PagedList
235 object, each element of which is a valid dictionary by this specification.
fed5d032 236
e0b9d78f
S
237 Additionally, playlists can have "title", "description" and "id" attributes
238 with the same semantics as videos (see above).
fed5d032
PH
239
240
241 _type "multi_video" indicates that there are multiple videos that
242 form a single show, for example multiple acts of an opera or TV episode.
243 It must have an entries key like a playlist and contain all the keys
244 required for a video at the same time.
245
246
247 _type "url" indicates that the video must be extracted from another
248 location, possibly by a different extractor. Its only required key is:
249 "url" - the next URL to extract.
f58766ce
PH
250 The key "ie_key" can be set to the class name (minus the trailing "IE",
251 e.g. "Youtube") if the extractor class is known in advance.
252 Additionally, the dictionary may have any properties of the resolved entity
253 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
254 known ahead of time.
255
256
257 _type "url_transparent" entities have the same specification as "url", but
258 indicate that the given additional information is more precise than the one
259 associated with the resolved URL.
260 This is useful when a site employs a video service that hosts the video and
261 its technical metadata, but that video service does not embed a useful
262 title, description etc.
263
264
d6983cb4
PH
265 Subclasses of this one should re-define the _real_initialize() and
266 _real_extract() methods and define a _VALID_URL regexp.
267 Probably, they should also be added to the list of extractors.
268
d6983cb4
PH
269 Finally, the _WORKING attribute should be set to False for broken IEs
270 in order to warn the users and skip the tests.
271 """
272
# Class-level default; __init__() resets it to False per instance and
# initialize() flips it to True after _real_initialize() has run.
_ready = False
# Downloader the extractor talks to; replaced through set_downloader().
_downloader = None
# Value reported by working(); the class docstring asks broken IEs to set
# this to False.
_WORKING = True
276
277 def __init__(self, downloader=None):
278 """Constructor. Receives an optional downloader."""
279 self._ready = False
280 self.set_downloader(downloader)
281
@classmethod
def suitable(cls, url):
    """Return True when this extractor's _VALID_URL pattern matches *url*."""
    # The compiled pattern is looked up in cls.__dict__ on purpose: hasattr
    # or getattr would also find a pattern cached on a superclass, and we
    # need the one compiled from *this* class's _VALID_URL.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
d6983cb4 292
ed9266db
PH
293 @classmethod
294 def _match_id(cls, url):
295 if '_VALID_URL_RE' not in cls.__dict__:
296 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
297 m = cls._VALID_URL_RE.match(url)
298 assert m
299 return m.group('id')
300
d6983cb4
PH
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
305
def initialize(self):
    """Run _real_initialize() once per instance (authentication, etc)."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
311
def extract(self, url):
    """Initialize the extractor and return the result of _real_extract(url).

    ExtractorError passes through untouched. IncompleteRead is re-raised as
    an expected ExtractorError describing a network error; KeyError and
    StopIteration are re-raised as an ExtractorError describing an
    extractor error. The original exception is attached as the cause.
    """
    try:
        self.initialize()
        result = self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
    return result
d6983cb4
PH
323
def set_downloader(self, downloader):
    """Remember *downloader* as the downloader used by this IE."""
    setattr(self, '_downloader', downloader)
327
328 def _real_initialize(self):
329 """Real initialization process. Redefine in subclasses."""
330 pass
331
332 def _real_extract(self, url):
333 """Real extraction process. Redefine in subclasses."""
334 pass
335
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the class name without its last two characters, as compat_str.

    This is the string used to look the InfoExtractor up with
    get_info_extractor.
    """
    class_name = cls.__name__
    return compat_str(class_name[:-2])
56c73665 340
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the instance's class minus its last two characters, as compat_str."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
d6983cb4 344
7cc3570e 345 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
346 """ Returns the response handle """
347 if note is None:
348 self.report_download_webpage(video_id)
349 elif note is not False:
7cc3570e 350 if video_id is None:
f1a9d64e 351 self.to_screen('%s' % (note,))
7cc3570e 352 else:
f1a9d64e 353 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 354 try:
dca08720 355 return self._downloader.urlopen(url_or_request)
d6983cb4 356 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
357 if errnote is False:
358 return False
d6983cb4 359 if errnote is None:
f1a9d64e 360 errnote = 'Unable to download webpage'
7f8b2714 361
9b9c5355 362 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
363 if fatal:
364 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
365 else:
366 self._downloader.report_warning(errmsg)
367 return False
d6983cb4 368
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
    """Fetch a page and return the tuple (decoded page content, URL handle).

    Returns False instead of a tuple when the request fails and *fatal* is
    false.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False for non-fatal failures.
        assert not fatal
        return False
    return (
        self._webpage_read_content(
            urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding),
        urlh)
381
c9a77969
YCH
382 @staticmethod
383 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
384 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
385 if m:
386 encoding = m.group(1)
387 else:
0d75ae2c 388 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
389 webpage_bytes[:1024])
390 if m:
391 encoding = m.group(1).decode('ascii')
b60016e8
PH
392 elif webpage_bytes.startswith(b'\xff\xfe'):
393 encoding = 'utf-16'
f143d86a
PH
394 else:
395 encoding = 'utf-8'
c9a77969
YCH
396
397 return encoding
398
399 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
400 content_type = urlh.headers.get('Content-Type', '')
401 webpage_bytes = urlh.read()
402 if prefix is not None:
403 webpage_bytes = prefix + webpage_bytes
404 if not encoding:
405 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
406 if self._downloader.params.get('dump_intermediate_pages', False):
407 try:
408 url = url_or_request.get_full_url()
409 except AttributeError:
410 url = url_or_request
f1a9d64e 411 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
412 dump = base64.b64encode(webpage_bytes).decode('ascii')
413 self._downloader.to_screen(dump)
d41e6efc
PH
414 if self._downloader.params.get('write_pages', False):
415 try:
416 url = url_or_request.get_full_url()
417 except AttributeError:
418 url = url_or_request
5afa7f8b 419 basen = '%s_%s' % (video_id, url)
c1bce22f 420 if len(basen) > 240:
f1a9d64e 421 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
422 basen = basen[:240 - len(h)] + h
423 raw_filename = basen + '.dump'
d41e6efc 424 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 425 self.to_screen('Saving request to ' + filename)
5f58165d
S
426 # Working around MAX_PATH limitation on Windows (see
427 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
428 if os.name == 'nt':
429 absfilepath = os.path.abspath(filename)
430 if len(absfilepath) > 259:
431 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
432 with open(filename, 'wb') as outf:
433 outf.write(webpage_bytes)
434
ec0fafbb
AA
435 try:
436 content = webpage_bytes.decode(encoding, 'replace')
437 except LookupError:
438 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 439
f1a9d64e
PH
440 if ('<title>Access to this site is blocked</title>' in content and
441 'Websense' in content[:512]):
442 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
443 blocked_iframe = self._html_search_regex(
444 r'<iframe src="([^"]+)"', content,
f1a9d64e 445 'Websense information URL', default=None)
2410c43d 446 if blocked_iframe:
f1a9d64e 447 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 448 raise ExtractorError(msg, expected=True)
77b2986b
PH
449 if '<title>The URL you requested has been blocked</title>' in content[:512]:
450 msg = (
451 'Access to this webpage has been blocked by Indian censorship. '
452 'Use a VPN or proxy server (with --proxy) to route around it.')
453 block_msg = self._html_search_regex(
454 r'</h1><p>(.*?)</p>',
455 content, 'block message', default=None)
456 if block_msg:
457 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
458 raise ExtractorError(msg, expected=True)
2410c43d 459
23be51d8 460 return content
d6983cb4 461
c9a77969 462 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
d6983cb4 463 """ Returns the data of the page as a string """
995ad69c
TF
464 success = False
465 try_count = 0
466 while success is False:
467 try:
c9a77969 468 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
995ad69c
TF
469 success = True
470 except compat_http_client.IncompleteRead as e:
471 try_count += 1
472 if try_count >= tries:
473 raise e
474 self._sleep(timeout, video_id)
7cc3570e
PH
475 if res is False:
476 return res
477 else:
478 content, _ = res
479 return content
d6983cb4 480
2a275ab0 481 def _download_xml(self, url_or_request, video_id,
f1a9d64e 482 note='Downloading XML', errnote='Unable to download XML',
c9a77969 483 transform_source=None, fatal=True, encoding=None):
267ed0c5 484 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 485 xml_string = self._download_webpage(
c9a77969 486 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
28746fbd
PH
487 if xml_string is False:
488 return xml_string
e2b38da9
PH
489 if transform_source:
490 xml_string = transform_source(xml_string)
36e6f62c 491 return compat_etree_fromstring(xml_string.encode('utf-8'))
267ed0c5 492
3d3538e4 493 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
494 note='Downloading JSON metadata',
495 errnote='Unable to download JSON metadata',
b090af59 496 transform_source=None,
c9a77969 497 fatal=True, encoding=None):
b090af59 498 json_string = self._download_webpage(
c9a77969
YCH
499 url_or_request, video_id, note, errnote, fatal=fatal,
500 encoding=encoding)
b090af59
PH
501 if (not fatal) and json_string is False:
502 return None
ebb64199
TF
503 return self._parse_json(
504 json_string, video_id, transform_source=transform_source, fatal=fatal)
505
506 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
507 if transform_source:
508 json_string = transform_source(json_string)
3d3538e4
PH
509 try:
510 return json.loads(json_string)
511 except ValueError as ve:
e7b6d122
PH
512 errmsg = '%s: Failed to parse JSON ' % video_id
513 if fatal:
514 raise ExtractorError(errmsg, cause=ve)
515 else:
516 self.report_warning(errmsg + str(ve))
3d3538e4 517
def report_warning(self, msg, video_id=None):
    """Send *msg* to the downloader as a warning tagged '[IE_NAME]'.

    When video_id is given it is inserted, followed by ': ', before msg.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 522
d6983cb4
PH
def to_screen(self, msg):
    """Hand *msg* to the downloader's to_screen(), prefixed with '[IE_NAME] '."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
526
def report_extraction(self, id_or_name):
    """Print the 'Extracting information' progress line for *id_or_name*."""
    line = '%s: Extracting information' % id_or_name
    self.to_screen(line)
d6983cb4
PH
530
def report_download_webpage(self, video_id):
    """Print the 'Downloading webpage' progress line for *video_id*."""
    line = '%s: Downloading webpage' % video_id
    self.to_screen(line)
d6983cb4
PH
534
def report_age_confirmation(self):
    """Print the line announcing an age-confirmation attempt."""
    line = 'Confirming age'
    self.to_screen(line)
d6983cb4 538
fc79158d
JMF
def report_login(self):
    """Print the line announcing a login attempt."""
    line = 'Logging in'
    self.to_screen(line)
fc79158d 542
43e7d3c9
S
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError: *msg* plus a hint on passing credentials."""
    hint = '%s. Use --username and --password or --netrc to provide account credentials.' % msg
    raise ExtractorError(hint, expected=True)
548
c430802e
S
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError: *msg* plus a hint about --proxy."""
    hint = '%s. You might want to use --proxy to workaround.' % msg
    raise ExtractorError(hint, expected=True)
554
5f6a1245 555 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Build a '_type': 'url' result pointing at a page still to be processed.

    'id' and 'title' are included only when video_id / video_title are not
    None; 'ie_key' is always present, even when *ie* is None.
    """
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url', 'url': url, 'ie_key': ie}
    optional = (('id', video_id), ('title', video_title))
    video_info.update((key, value) for key, value in optional if value is not None)
    return video_info
5f6a1245 568
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a '_type': 'playlist' result around *entries*.

    'id', 'title' and 'description' are added only for truthy arguments.
    """
    video_info = {'_type': 'playlist', 'entries': entries}
    optional = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional:
        if value:
            video_info[key] = value
    return video_info
581
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: one pattern (string or compiled regex) or an iterable of
             patterns tried in order until one matches.
    name:    field name used in the failure message.
    default: returned on failure when it is anything but NO_DEFAULT.
    fatal:   without a default, raise RegexNotFoundError on failure when
             true; otherwise report a warning and return None.
    group:   group to return from the match; None returns the first group
             that is not None.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty pattern list takes the
        # normal failure path below instead of leaving mobj unbound.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # The field name is wrapped in ANSI colour codes only when colour is
    # not disabled, the OS is not Windows and stderr is a terminal.
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
615
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Same search as _search_regex, with a truthy result passed through
    clean_html() and stripped. Falsy results are returned unchanged.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        return res
    return clean_html(res).strip()
625
fc79158d
JMF
626 def _get_login_info(self):
627 """
cf0649f8 628 Get the login info as (username, password)
fc79158d
JMF
629 It will look in the netrc file using the _NETRC_MACHINE value
630 If there's no info available, return (None, None)
631 """
632 if self._downloader is None:
633 return (None, None)
634
635 username = None
636 password = None
637 downloader_params = self._downloader.params
638
639 # Attempt to use provided username and password or .netrc data
d800609c 640 if downloader_params.get('username') is not None:
fc79158d
JMF
641 username = downloader_params['username']
642 password = downloader_params['password']
643 elif downloader_params.get('usenetrc', False):
644 try:
645 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
646 if info is not None:
647 username = info[0]
648 password = info[2]
649 else:
650 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
651 except (IOError, netrc.NetrcParseError) as err:
9b9c5355 652 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
5f6a1245 653
fc79158d
JMF
654 return (username, password)
655
e64b7569 656 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 657 """
658 Get the two-factor authentication info
659 TODO - asking the user will be required for sms/phone verify
660 currently just uses the command line option
661 If there's no info available, return None
662 """
663 if self._downloader is None:
83317f69 664 return None
665 downloader_params = self._downloader.params
666
d800609c 667 if downloader_params.get('twofactor') is not None:
83317f69 668 return downloader_params['twofactor']
669
e64b7569 670 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 671
46720279
JMF
672 # Helper functions for extracting OpenGraph info
673 @staticmethod
ab2d5247 674 def _og_regexes(prop):
448ef1f3 675 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
676 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
677 % {'prop': re.escape(prop)})
78fb87b2 678 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 679 return [
78fb87b2
JMF
680 template % (property_re, content_re),
681 template % (content_re, property_re),
ab2d5247 682 ]
46720279 683
864f24bd
S
@staticmethod
def _meta_regex(prop):
    """Return a regex for the <meta> tag identified by *prop*.

    The tag must carry *prop* (regex-escaped) as the value of an itemprop,
    name, property, id or http-equiv attribute; the value of its content
    attribute is captured in the named group 'content'. The pattern sets
    the i, s and x flags inline, so the line breaks and indentation inside
    the literal are not part of what is matched.
    """
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
689
3c4e6d83 690 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 691 if name is None:
3c4e6d83 692 name = 'OpenGraph %s' % prop
ab2d5247 693 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
694 if escaped is None:
695 return None
696 return unescapeHTML(escaped)
46720279
JMF
697
698 def _og_search_thumbnail(self, html, **kargs):
10952eb2 699 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
700
701 def _og_search_description(self, html, **kargs):
702 return self._og_search_property('description', html, fatal=False, **kargs)
703
704 def _og_search_title(self, html, **kargs):
705 return self._og_search_property('title', html, **kargs)
706
8ffa13e0 707 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
708 regexes = self._og_regexes('video') + self._og_regexes('video:url')
709 if secure:
710 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 711 return self._html_search_regex(regexes, html, name, **kargs)
46720279 712
78338f71
JMF
713 def _og_search_url(self, html, **kargs):
714 return self._og_search_property('url', html, **kargs)
715
40c696e5 716 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
717 if display_name is None:
718 display_name = name
719 return self._html_search_regex(
864f24bd 720 self._meta_regex(name),
711ede6e 721 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
722
723 def _dc_search_uploader(self, html):
724 return self._html_search_meta('dc.creator', html, 'uploader')
725
8dbe9899
PH
726 def _rta_search(self, html):
727 # See http://www.rtalabel.org/index.php?content=howtofaq#single
728 if re.search(r'(?ix)<meta\s+name="rating"\s+'
729 r' content="RTA-5042-1996-1400-1577-RTA"',
730 html):
731 return 18
732 return 0
733
59040888
PH
734 def _media_rating_search(self, html):
735 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
736 rating = self._html_search_meta('rating', html)
737
738 if not rating:
739 return None
740
741 RATING_TABLE = {
742 'safe for kids': 0,
743 'general': 8,
744 '14 years': 14,
745 'mature': 17,
746 'restricted': 19,
747 }
d800609c 748 return RATING_TABLE.get(rating.lower())
59040888 749
69319969 750 def _family_friendly_search(self, html):
6ca7732d 751 # See http://schema.org/VideoObject
69319969
NJ
752 family_friendly = self._html_search_meta('isFamilyFriendly', html)
753
754 if not family_friendly:
755 return None
756
757 RATING_TABLE = {
758 '1': 0,
759 'true': 0,
760 '0': 18,
761 'false': 18,
762 }
d800609c 763 return RATING_TABLE.get(family_friendly.lower())
69319969 764
0c708f11
JMF
765 def _twitter_search_player(self, html):
766 return self._html_search_meta('twitter:player', html,
9e1a5b84 767 'twitter card player')
0c708f11 768
0b26ba3f 769 def _search_json_ld(self, html, video_id, **kwargs):
4ca2a3cf
S
770 json_ld = self._search_regex(
771 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
0b26ba3f 772 html, 'JSON-LD', group='json_ld', **kwargs)
4ca2a3cf
S
773 if not json_ld:
774 return {}
0b26ba3f 775 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
4ca2a3cf
S
776
def _json_ld(self, json_ld, video_id, fatal=True):
    """Turn a schema.org JSON-LD object into an info dict fragment.

    json_ld may be a string (parsed with _parse_json first) or an already
    decoded object. Only objects whose '@context' is 'http://schema.org'
    and whose '@type' is 'TVEpisode' or 'Article' contribute fields; keys
    whose value is None are dropped from the result.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld or json_ld.get('@context') != 'http://schema.org':
        return {}

    info = {}
    item_type = json_ld.get('@type')
    if item_type == 'TVEpisode':
        info['episode'] = unescapeHTML(json_ld.get('name'))
        info['episode_number'] = int_or_none(json_ld.get('episodeNumber'))
        info['description'] = unescapeHTML(json_ld.get('description'))
        season = json_ld.get('partOfSeason')
        if isinstance(season, dict) and season.get('@type') == 'TVSeason':
            info['season_number'] = int_or_none(season.get('seasonNumber'))
        series = json_ld.get('partOfSeries')
        if isinstance(series, dict) and series.get('@type') == 'TVSeries':
            info['series'] = unescapeHTML(series.get('name'))
    elif item_type == 'Article':
        info['timestamp'] = parse_iso8601(json_ld.get('datePublished'))
        info['title'] = unescapeHTML(json_ld.get('headline'))
        info['description'] = unescapeHTML(json_ld.get('articleBody'))

    # Drop fields that could not be extracted
    return dict((key, value) for key, value in info.items() if value is not None)
804
27713812 805 @staticmethod
f8da79f8 806 def _hidden_inputs(html):
586f1cc5 807 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 808 hidden_inputs = {}
73eb13df 809 for input in re.findall(r'(?i)<input([^>]+)>', html):
be0e5dbd 810 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
201ea3ee
S
811 continue
812 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
813 if not name:
814 continue
815 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
816 if not value:
817 continue
818 hidden_inputs[name.group('value')] = value.group('value')
819 return hidden_inputs
27713812 820
cf61d96d
S
821 def _form_hidden_inputs(self, form_id, html):
822 form = self._search_regex(
73eb13df 823 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
824 html, '%s form' % form_id, group='form')
825 return self._hidden_inputs(form)
826
def _sort_formats(self, formats, field_preference=None):
    """Sort formats in place by an ascending preference key.

    Formats with larger preference/quality/bitrate/... values therefore
    end up later in the list. When field_preference is a list or tuple,
    only those fields (in that order) make up the key, with -1 standing
    in for missing values.

    Raises ExtractorError when formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for fmt in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' in fmt:
            continue
        if fmt.get('abr') is not None and fmt.get('vbr') is not None:
            fmt['tbr'] = fmt['abr'] + fmt['vbr']

    def _value_or(fmt, field, fallback=-1):
        # Value stored under field, or fallback when it is missing/None
        value = fmt.get(field)
        return fallback if value is None else value

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            return tuple(_value_or(f, field) for field in field_preference)

        preference = _value_or(f, 'preference', 0)
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
            preference -= 0.5

        proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

        prefer_free = self._downloader.params.get('prefer_free_formats')

        def _rank(order):
            # Position of the format's extension in order, -1 when unlisted
            try:
                return order.index(f['ext'])
            except ValueError:
                return -1

        if f.get('vcodec') == 'none':  # audio only
            if prefer_free:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _rank(ORDER)
        else:
            if prefer_free:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _rank(ORDER)
            audio_ext_preference = 0

        return (
            preference,
            _value_or(f, 'language_preference'),
            _value_or(f, 'quality'),
            _value_or(f, 'tbr'),
            _value_or(f, 'filesize'),
            _value_or(f, 'vbr'),
            _value_or(f, 'height'),
            _value_or(f, 'width'),
            proto_preference,
            ext_preference,
            _value_or(f, 'abr'),
            audio_ext_preference,
            _value_or(f, 'fps'),
            _value_or(f, 'filesize_approx'),
            _value_or(f, 'source_preference'),
            _value_or(f, 'format_id', ''),
        )
    formats.sort(key=_formats_key)
59040888 894
96a53167
S
895 def _check_formats(self, formats, video_id):
896 if formats:
897 formats[:] = filter(
898 lambda f: self._is_valid_url(
899 f['url'], video_id,
900 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
901 formats)
902
f5bdb444
S
903 @staticmethod
904 def _remove_duplicate_formats(formats):
905 format_urls = set()
906 unique_formats = []
907 for f in formats:
908 if f['url'] not in format_urls:
909 format_urls.add(f['url'])
910 unique_formats.append(f)
911 formats[:] = unique_formats
912
96a53167 913 def _is_valid_url(self, url, video_id, item='video'):
2f0f6578
S
914 url = self._proto_relative_url(url, scheme='http:')
915 # For now assume non HTTP(S) URLs always valid
916 if not (url.startswith('http://') or url.startswith('https://')):
917 return True
96a53167 918 try:
4069766c 919 self._request_webpage(url, video_id, 'Checking %s URL' % item)
96a53167
S
920 return True
921 except ExtractorError as e:
943a1e24 922 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
923 self.to_screen(
924 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
925 return False
926 raise
927
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
934
57c7411f
PH
935 def _proto_relative_url(self, url, scheme=None):
936 if url is None:
937 return url
938 if url.startswith('//'):
939 if scheme is None:
940 scheme = self.http_scheme()
941 return scheme + url
942 else:
943 return url
944
4094b6e3
PH
945 def _sleep(self, timeout, video_id, msg_template=None):
946 if msg_template is None:
f1a9d64e 947 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
948 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
949 self.to_screen(msg)
950 time.sleep(timeout)
951
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """Download an f4m manifest and build a list of format dicts from it.

    Both the 1.0 and 2.0 f4m namespaces are recognised. For 2.0 manifests
    each media entry's href/url is resolved against the manifest's baseURL
    element (or, failing that, the directory of manifest_url); entries
    that point to another f4m manifest are extracted recursively.

    Returns an empty list when the download fails non-fatally. The
    resulting formats are sorted with _sort_formats.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)

    if manifest is False:
        return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Relative media URLs are always resolved against the manifest itself.
    # Computed once so that one media entry can never influence the
    # resolution of the entries that follow it.
    manifest_dir = '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = (base_url or manifest_dir) + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
1006
f0b5d6af 1007 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
60ca389c 1008 entry_protocol='m3u8', preference=None,
13af92fd
YCH
1009 m3u8_id=None, note=None, errnote=None,
1010 fatal=True):
f0b5d6af 1011
704df56d 1012 formats = [{
f207019c 1013 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1014 'url': m3u8_url,
1015 'ext': ext,
1016 'protocol': 'm3u8',
9fe6ef7a 1017 'preference': preference - 1 if preference else -1,
704df56d
PH
1018 'resolution': 'multiple',
1019 'format_note': 'Quality selection URL',
1020 }]
1021
f0b5d6af
PH
1022 format_url = lambda u: (
1023 u
1024 if re.match(r'^https?://', u)
1025 else compat_urlparse.urljoin(m3u8_url, u))
1026
dbd82a1d 1027 res = self._download_webpage_handle(
81515ad9 1028 m3u8_url, video_id,
621ed9f5 1029 note=note or 'Downloading m3u8 information',
13af92fd
YCH
1030 errnote=errnote or 'Failed to download m3u8 information',
1031 fatal=fatal)
dbd82a1d 1032 if res is False:
8d29e47f 1033 return []
dbd82a1d 1034 m3u8_doc, urlh = res
37113045 1035 m3u8_url = urlh.geturl()
9cdffeeb
S
1036
1037 # We should try extracting formats only from master playlists [1], i.e.
1038 # playlists that describe available qualities. On the other hand media
1039 # playlists [2] should be returned as is since they contain just the media
1040 # without qualities renditions.
1041 # Fortunately, master playlist can be easily distinguished from media
1042 # playlist based on particular tags availability. As of [1, 2] master
1043 # playlist tags MUST NOT appear in a media playist and vice versa.
1044 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1045 # and MUST NOT appear in master playlist thus we can clearly detect media
1046 # playlist with this criterion.
1047 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1048 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1049 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1050 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
7f32e5dc 1051 return [{
1052 'url': m3u8_url,
1053 'format_id': m3u8_id,
1054 'ext': ext,
1055 'protocol': entry_protocol,
1056 'preference': preference,
1057 }]
704df56d 1058 last_info = None
fa156077 1059 last_media = None
704df56d
PH
1060 kv_rex = re.compile(
1061 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1062 for line in m3u8_doc.splitlines():
1063 if line.startswith('#EXT-X-STREAM-INF:'):
1064 last_info = {}
1065 for m in kv_rex.finditer(line):
1066 v = m.group('val')
1067 if v.startswith('"'):
1068 v = v[1:-1]
1069 last_info[m.group('key')] = v
4cd95bcb
JMF
1070 elif line.startswith('#EXT-X-MEDIA:'):
1071 last_media = {}
1072 for m in kv_rex.finditer(line):
1073 v = m.group('val')
1074 if v.startswith('"'):
1075 v = v[1:-1]
1076 last_media[m.group('key')] = v
704df56d
PH
1077 elif line.startswith('#') or not line.strip():
1078 continue
1079 else:
daebaab6 1080 if last_info is None:
f0b5d6af 1081 formats.append({'url': format_url(line)})
3524cc25 1082 continue
704df56d 1083 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
8dc9d361
S
1084 format_id = []
1085 if m3u8_id:
1086 format_id.append(m3u8_id)
05d5392c 1087 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
8dc9d361 1088 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
704df56d 1089 f = {
8dc9d361 1090 'format_id': '-'.join(format_id),
f0b5d6af 1091 'url': format_url(line.strip()),
704df56d
PH
1092 'tbr': tbr,
1093 'ext': ext,
f0b5d6af
PH
1094 'protocol': entry_protocol,
1095 'preference': preference,
704df56d 1096 }
704df56d
PH
1097 resolution = last_info.get('RESOLUTION')
1098 if resolution:
1099 width_str, height_str = resolution.split('x')
1100 f['width'] = int(width_str)
1101 f['height'] = int(height_str)
fbb6edd2
S
1102 codecs = last_info.get('CODECS')
1103 if codecs:
1104 vcodec, acodec = [None] * 2
1105 va_codecs = codecs.split(',')
1106 if len(va_codecs) == 1:
1107 # Audio only entries usually come with single codec and
1108 # no resolution. For more robustness we also check it to
1109 # be mp4 audio.
1110 if not resolution and va_codecs[0].startswith('mp4a'):
1111 vcodec, acodec = 'none', va_codecs[0]
1112 else:
1113 vcodec = va_codecs[0]
1114 else:
1115 vcodec, acodec = va_codecs[:2]
1116 f.update({
1117 'acodec': acodec,
1118 'vcodec': vcodec,
1119 })
4cd95bcb
JMF
1120 if last_media is not None:
1121 f['m3u8_media'] = last_media
1122 last_media = None
704df56d
PH
1123 formats.append(f)
1124 last_info = {}
1125 self._sort_formats(formats)
1126 return formats
1127
a107193e
S
1128 @staticmethod
1129 def _xpath_ns(path, namespace=None):
1130 if not namespace:
1131 return path
1132 out = []
1133 for c in path.split('/'):
1134 if not c or c == '.':
1135 out.append(c)
1136 else:
1137 out.append('{%s}%s' % (namespace, c))
1138 return '/'.join(out)
1139
1140 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1141 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1142
995029a1
PH
1143 if smil is False:
1144 assert not fatal
1145 return []
e89a2aab 1146
17712eeb 1147 namespace = self._parse_smil_namespace(smil)
a107193e
S
1148
1149 return self._parse_smil_formats(
1150 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1151
1152 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1153 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1154 if smil is False:
1155 return {}
1156 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1157
1158 def _download_smil(self, smil_url, video_id, fatal=True):
1159 return self._download_xml(
1160 smil_url, video_id, 'Downloading SMIL file',
1161 'Unable to download SMIL file', fatal=fatal)
1162
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats and
    _parse_smil_subtitles. The resulting id is the base name of smil_url
    without its extension; title, description and upload date are taken
    from the first matching ./head/meta elements and thumbnails from all
    image elements that have a src.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    # From here on the id is derived from the SMIL URL itself
    video_id = os.path.splitext(url_basename(smil_url))[0]
    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first occurrence of each field is kept
        if name == 'title' and not title:
            title = content
        elif name in ('description', 'abstract') and not description:
            description = content
        elif name == 'date' and not upload_date:
            upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1202
17712eeb
S
1203 def _parse_smil_namespace(self, smil):
1204 return self._search_regex(
1205 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1206
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Build a sorted list of format dicts from the video elements of a SMIL document.

    Each distinct video src yields RTMP, HLS (m3u8), HDS (f4m) or plain
    HTTP formats depending on its proto attribute, its extension and the
    streamer/base URL. transform_rtmp_url, when given, is called with
    (streamer, src) and must return the pair to use for RTMP formats.
    """
    # The base URL is the first meta base/httpBase, else the SMIL URL itself
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        meta_base = meta.get('base') or meta.get('httpBase')
        if meta_base:
            base = meta_base
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = []
    for video in smil.findall(self._xpath_ns('.//video', namespace)):
        src = video.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.append(src)

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            rtmp_format = {
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            }
            formats.append(rtmp_format)
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                rtmp_format.update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A single format carries the metadata of this video element
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            separator = '&' if '?' in src_url else '?'
            f4m_url = src_url + separator + compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        if src_url.startswith('http') and self._is_valid_url(src, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    self._sort_formats(formats)

    return formats
1302
ce00af87 1303 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 1304 urls = []
a107193e
S
1305 subtitles = {}
1306 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1307 src = textstream.get('src')
d413095f 1308 if not src or src in urls:
a107193e 1309 continue
d413095f 1310 urls.append(src)
cafcf657 1311 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
03bc7237 1312 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
1313 subtitles.setdefault(lang, []).append({
1314 'url': src,
1315 'ext': ext,
1316 })
1317 return subtitles
63757032 1318
942acef5
S
1319 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1320 xspf = self._download_xml(
8d6765cf 1321 playlist_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
1322 'Unable to download xspf manifest', fatal=fatal)
1323 if xspf is False:
1324 return []
1325 return self._parse_xspf(xspf, playlist_id)
8d6765cf 1326
def _parse_xspf(self, playlist, playlist_id):
    """Convert the tracks of a parsed XSPF playlist into a list of info dicts.

    Every entry uses playlist_id as its id; the title falls back to
    playlist_id as well. The duration value is divided by 1000. Formats
    are built from the track's location elements and sorted with
    _sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the prefixes of path using NS_MAP
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration = float_or_none(
            xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1361
1bac3455 1362 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1363 res = self._download_webpage_handle(
1364 mpd_url, video_id,
1365 note=note or 'Downloading MPD manifest',
1366 errnote=errnote or 'Failed to download MPD manifest',
2d2fa82d 1367 fatal=fatal)
1bac3455 1368 if res is False:
2d2fa82d 1369 return []
1bac3455 1370 mpd, urlh = res
1371 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1372
91cb6b50 1373 return self._parse_mpd_formats(
1bac3455 1374 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
2d2fa82d 1375
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
    """Build a sorted list of format dicts from a parsed MPD document.

    Manifests of type 'dynamic' yield no formats. AdaptationSets and
    Representations that contain a ContentProtection element are skipped.
    formats_dict may map a representation id to a dict of extra fields
    that seed the corresponding format; it is only read, never modified.
    """
    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent level's info and override it with
        # whatever SegmentList / SegmentTemplate data this element carries
        ms_info = ms_parent_info.copy()
        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            initialization = segment_list.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                start_number = segment_template.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        # Each S element stands for itself plus r repetitions
                        ms_info['total_number'] = 0
                        for s in s_e:
                            ms_info['total_number'] += 1 + int(s.get('r', '0'))
                else:
                    timescale = segment_template.get('timescale')
                    if timescale:
                        ms_info['timescale'] = int(timescale)
                    segment_duration = segment_template.get('duration')
                    if segment_duration:
                        ms_info['segment_duration'] = int(segment_duration)
                media_template = segment_template.get('media')
                if media_template:
                    ms_info['media_template'] = media_template
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization_url'] = initialization
                else:
                    initialization = segment_template.find(_add_ns('Initialization'))
                    if initialization is not None:
                        ms_info['initialization_url'] = initialization.attrib['sourceURL']
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                mime_type = representation_attrib.get('mimeType')
                content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    pass
                elif content_type == 'video' or content_type == 'audio':
                    # Assemble the base URL from the innermost BaseURL outwards,
                    # stopping as soon as it becomes absolute
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    f = {
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'url': base_url,
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                        'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                    }
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                    if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                        # Without a SegmentTimeline the number of segments has to be
                        # derived from the period and segment durations, which is only
                        # possible when both are actually known
                        if ('total_number' not in representation_ms_info and
                                'segment_duration' in representation_ms_info and
                                period_duration is not None):
                            segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                        # Segment URLs can only be generated when the segment count is known
                        if 'total_number' in representation_ms_info:
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
                            # '$$' is the escape sequence for a literal '$'
                            media_template = media_template.replace('$$', '$')
                            representation_ms_info['segment_urls'] = [
                                media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': representation_attrib.get('bandwidth'),
                                }
                                for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    if 'segment_urls' in representation_ms_info:
                        f.update({
                            'segment_urls': representation_ms_info['segment_urls'],
                            'protocol': 'http_dash_segments',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                            f.update({
                                'initialization_url': initialization_url,
                            })
                            if not f.get('url'):
                                f['url'] = initialization_url
                    # Merge into an already collected format with the same id,
                    # otherwise add a new one seeded from formats_dict
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                    except StopIteration:
                        full_info = formats_dict.get(representation_id, {}).copy()
                        full_info.update(f)
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    self._sort_formats(formats)
    return formats
1518
f4b1c7ad
PH
1519 def _live_title(self, name):
1520 """ Generate the title for a live video """
1521 now = datetime.datetime.now()
611c1dd9 1522 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
1523 return name + ' ' + now_str
1524
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v to an int with int_or_none, reporting failures.

    name is used in the failure message. When the conversion yields None,
    ExtractorError is raised if fatal is set; otherwise a warning is
    reported through the downloader and None is returned. Extra keyword
    arguments are forwarded to int_or_none.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
1536
def _float(self, v, name, fatal=False, **kwargs):
    """ Convert v to a float, reporting a failure to the user

    Extra keyword arguments are forwarded unchanged to float_or_none.
    On failure either raises ExtractorError (fatal) or emits a warning
    through the downloader and returns None.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    failure = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(failure)
    self._downloader.report_warning(failure)
    return None
1546
def _set_cookie(self, domain, name, value, expire_time=None):
    """ Store a cookie for domain (path '/') in the downloader's cookie jar """
    # Same values as the positional form, spelled out by constructor
    # parameter name for readability.
    new_cookie = compat_cookiejar.Cookie(
        version=0,
        name=name,
        value=value,
        port=None,
        port_specified=None,
        domain=domain,
        domain_specified=None,
        domain_initial_dot=None,
        path='/',
        path_specified=True,
        secure=False,
        expires=expire_time,
        discard='',
        comment=None,
        comment_url=None,
        rest=None)
    self._downloader.cookiejar.set_cookie(new_cookie)
1552
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header on a throwaway request,
    # then parse that header back into a SimpleCookie.
    probe_request = sanitized_Request(url)
    jar = self._downloader.cookiejar
    jar.add_cookie_header(probe_request)
    cookie_header = probe_request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
1558
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """ Yield the test case dicts declared on this extractor

    Test cases come from either _TEST (a single dict) or _TESTS (a list);
    declaring both is an error. Cases flagged 'only_matching' are skipped
    unless include_onlymatching is true. Each yielded dict gets a 'name'
    key set to the class name without its trailing two characters ('IE').
    """
    single_case = getattr(self, '_TEST', None)
    if single_case:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        cases = [single_case]
    else:
        cases = getattr(self, '_TESTS', [])

    for case in cases:
        if include_onlymatching or not case.get('only_matching', False):
            # The dict is annotated in place, as declared on the class.
            case['name'] = type(self).__name__[:-len('IE')]
            yield case
1572
def is_suitable(self, age_limit):
    """ Tell whether this extractor is generally usable under age_limit

    The decision is based on the 'age_limit' recorded in the info_dict of
    the extractor's test cases (for playlist tests, the first playlist
    entry): one unrestricted test case is enough to be suitable, and an
    extractor without test cases is considered suitable as well.
    """
    saw_restricted_case = False
    for case in self.get_testcases(include_onlymatching=False):
        if 'playlist' in case:
            case = case['playlist'][0]
        declared_limit = case.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_restricted_case = True
    return not saw_restricted_case
1587
def extract_subtitles(self, *args, **kwargs):
    """ Return subtitles via _get_subtitles, but only when they are wanted

    Subtitles are wanted when the 'writesubtitles' or 'listsubtitles'
    downloader option is set; otherwise an empty dict is returned and
    _get_subtitles is not called.
    """
    options = self._downloader.params
    wanted = options.get('writesubtitles', False) or options.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
1593
def _get_subtitles(self, *args, **kwargs):
    """ Hook called by extract_subtitles; this base version always raises """
    reason = 'This method must be implemented by subclasses'
    raise NotImplementedError(reason)
a504ced0 1596
912e0b7e
YCH
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
    """ Merge the subtitle items of one language into a new list

    All items of subtitle_list1 are kept; items of subtitle_list2 are
    appended unless their 'url' already occurs in subtitle_list1.
    Neither input list is modified.
    """
    known_urls = set(entry['url'] for entry in subtitle_list1)
    merged = list(subtitle_list1)
    for entry in subtitle_list2:
        # Only URLs from the first list count as duplicates.
        if entry['url'] not in known_urls:
            merged.append(entry)
    return merged
1605
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
    """ Combine two language -> subtitle items dictionaries into a new one

    Languages found only in subtitle_dict1 are copied over as is; every
    language of subtitle_dict2 is merged with _merge_subtitle_items.
    """
    merged = dict(subtitle_dict1)
    for lang, extra_items in subtitle_dict2.items():
        base_items = subtitle_dict1.get(lang, [])
        merged[lang] = cls._merge_subtitle_items(base_items, extra_items)
    return merged
1613
def extract_automatic_captions(self, *args, **kwargs):
    """ Return automatic captions via _get_automatic_captions when wanted

    Captions are wanted when the 'writeautomaticsub' or 'listsubtitles'
    downloader option is set; otherwise an empty dict is returned and
    _get_automatic_captions is not called.
    """
    options = self._downloader.params
    wanted = options.get('writeautomaticsub', False) or options.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
1619
def _get_automatic_captions(self, *args, **kwargs):
    """ Hook called by extract_automatic_captions; this base version always raises """
    reason = 'This method must be implemented by subclasses'
    raise NotImplementedError(reason)
360e1ca5 1622
d77ab8e2
S
def mark_watched(self, *args, **kwargs):
    """ Call _mark_watched if the user asked for it and credentials exist

    Nothing happens unless the 'mark_watched' downloader option is set
    and either _get_login_info() yields a non-None first element or a
    'cookiefile' option is configured.
    """
    options = self._downloader.params
    if not options.get('mark_watched', False):
        return
    # Login info is consulted first; the cookie file is only a fallback.
    if self._get_login_info()[0] is None and options.get('cookiefile') is None:
        return
    self._mark_watched(*args, **kwargs)
1628
def _mark_watched(self, *args, **kwargs):
    """ Hook called by mark_watched; this base version always raises """
    reason = 'This method must be implemented by subclasses'
    raise NotImplementedError(reason)
1631
8dbe9899 1632
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY<prefix>:{query}, where
    <prefix> is empty (one result), a positive number of results, or "all".
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex (as a string) matching this extractor's search URLs"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query handled by this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results

        Raises ExtractorError if the query does not match the expected
        format or asks for a non-positive number of results.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        search_terms = parsed.group('query')

        # An empty prefix means a single result, "all" means as many as allowed.
        if prefix == '':
            return self._get_n_results(search_terms, 1)
        if prefix == 'all':
            return self._get_n_results(search_terms, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, search_terms))
        if wanted > self._MAX_RESULTS:
            # Clamp oversized requests to the maximum, telling the user about it.
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)'
                % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(search_terms, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        reason = 'This method must be implemented by subclasses'
        raise NotImplementedError(reason)

    @property
    def SEARCH_KEY(self):
        """Read-only public view of _SEARCH_KEY"""
        return self._SEARCH_KEY