]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[iqiyi] fix iqiyi (2015-07-17), update the md5 salt (enc_key) to iqiyi latest (2015...
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
96a53167 17 compat_HTTPError,
d6983cb4
PH
18 compat_http_client,
19 compat_urllib_error,
c7deaa4c 20 compat_urllib_parse_urlparse,
f0b5d6af 21 compat_urlparse,
d6983cb4 22 compat_str,
8c25f81b
PH
23)
24from ..utils import (
c342041f 25 NO_DEFAULT,
05900629 26 age_restricted,
08f2a92c 27 bug_reports_message,
d6983cb4
PH
28 clean_html,
29 compiled_regex_type,
70f0f5a8 30 determine_ext,
d6983cb4 31 ExtractorError,
97f4aecf 32 fix_xml_ampersands,
b14f3a4c 33 float_or_none,
31bb8d3f 34 int_or_none,
55b3e45b 35 RegexNotFoundError,
d41e6efc 36 sanitize_filename,
f38de77f 37 unescapeHTML,
d6983cb4 38)
c342041f 39
d6983cb4
PH
40
41class InfoExtractor(object):
42 """Information Extractor class.
43
44 Information extractors are the classes that, given a URL, extract
45 information about the video (or videos) the URL refers to. This
46 information includes the real video URL, the video title, author and
47 others. The information is stored in a dictionary which is then
5d380852 48 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
49 information possibly downloading the video to the file system, among
50 other possible outcomes.
51
cf0649f8 52 The type field determines the type of the result.
fed5d032
PH
53 By far the most common value (and the default if _type is missing) is
54 "video", which indicates a single video.
55
56 For a video, the dictionaries must include the following fields:
d6983cb4
PH
57
58 id: Video identifier.
d6983cb4 59 title: Video title, unescaped.
d67b0b15 60
f49d89ee 61 Additionally, it must contain either a formats entry or a url one:
d67b0b15 62
f49d89ee
PH
63 formats: A list of dictionaries for each format available, ordered
64 from worst to best quality.
65
66 Potential fields:
d67b0b15
PH
67 * url Mandatory. The URL of the video file
68 * ext Will be calculated from url if missing
69 * format A human-readable description of the format
70 ("mp4 container with h264/opus").
71 Calculated from the format_id, width, height.
72 and format_note fields if missing.
73 * format_id A short description of the format
5d4f3985
PH
74 ("mp4_h264_opus" or "19").
75 Technically optional, but strongly recommended.
d67b0b15
PH
76 * format_note Additional info about the format
77 ("3D" or "DASH video")
78 * width Width of the video, if known
79 * height Height of the video, if known
f49d89ee 80 * resolution Textual description of width and height
7217e148 81 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
82 * abr Average audio bitrate in KBit/s
83 * acodec Name of the audio codec in use
dd27fd17 84 * asr Audio sampling rate in Hertz
d67b0b15 85 * vbr Average video bitrate in KBit/s
fbb21cf5 86 * fps Frame rate
d67b0b15 87 * vcodec Name of the video codec in use
1394ce65 88 * container Name of the container format
d67b0b15 89 * filesize The number of bytes, if known in advance
9732d77e 90 * filesize_approx An estimate for the number of bytes
d67b0b15 91 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
92 * protocol The protocol that will be used for the actual
93 download, lower-case.
b04b8852
PH
94 "http", "https", "rtsp", "rtmp", "rtmpe",
95 "m3u8", or "m3u8_native".
f49d89ee 96 * preference Order number of this format. If this field is
08d13955 97 present and not None, the formats get sorted
38d63d84 98 by this field, regardless of all other values.
f49d89ee
PH
99 -1 for default (order by other properties),
100 -2 or smaller for less than default.
e65566a9
PH
101 < -1000 to hide the format (if there is
102 another one which is strictly better)
aff2f4f4
PH
103 * language_preference Is this in the correct requested
104 language?
105 10 if it's what the URL is about,
106 -1 for default (don't know),
107 -10 otherwise, other values reserved for now.
5d73273f
PH
108 * quality Order number of the video quality of this
109 format, irrespective of the file format.
110 -1 for default (order by other properties),
111 -2 or smaller for less than default.
c64ed2a3
PH
112 * source_preference Order number for this video source
113 (quality takes higher priority)
114 -1 for default (order by other properties),
115 -2 or smaller for less than default.
d769be6c
PH
116 * http_headers A dictionary of additional HTTP headers
117 to add to the request.
6271f1ca 118 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
119 video's pixels are not square.
120 width : height ratio as float.
121 * no_resume The server does not support resuming the
122 (HTTP or RTMP) download. Boolean.
123
c0ba0f48 124 url: Final video URL.
d6983cb4 125 ext: Video filename extension.
d67b0b15
PH
126 format: The video format, defaults to ext (used for --get-format)
127 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 128
d6983cb4
PH
129 The following fields are optional:
130
f5e43bc6 131 alt_title: A secondary title of the video.
0afef30b
PH
132 display_id An alternative identifier for the video, not necessarily
133 unique, but available before title. Typically, id is
134 something like "4234987", title "Dancing naked mole rats",
135 and display_id "dancing-naked-mole-rats"
d5519808 136 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 137 * "id" (optional, string) - Thumbnail format ID
d5519808 138 * "url"
cfb56d1a 139 * "preference" (optional, int) - quality of the image
d5519808
PH
140 * "width" (optional, int)
141 * "height" (optional, int)
142 * "resolution" (optional, string "{width}x{height}",
143 deprecated)
d6983cb4 144 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 145 description: Full video description.
d6983cb4 146 uploader: Full name of the video uploader.
9bb8e0a3 147 creator: The main artist who created the video.
955c4514 148 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 149 upload_date: Video upload date (YYYYMMDD).
955c4514 150 If not explicitly set, calculated from timestamp.
d6983cb4 151 uploader_id: Nickname or id of the video uploader.
da9ec3b9 152 location: Physical location where the video was filmed.
a504ced0
JMF
153 subtitles: The available subtitles as a dictionary in the format
154 {language: subformats}. "subformats" is a list sorted from
155 lower to higher preference, each element is a dictionary
156 with the "ext" entry and one of:
157 * "data": The subtitles file contents
158 * "url": A url pointing to the subtitles file
360e1ca5
JMF
159 automatic_captions: Like 'subtitles', used by the YoutubeIE for
160 automatically generated captions
c0ba0f48 161 duration: Length of the video in seconds, as an integer.
f3d29461 162 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
163 like_count: Number of positive ratings of the video
164 dislike_count: Number of negative ratings of the video
2d30521a 165 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 166 comment_count: Number of comments on the video
dd622d7c
PH
167 comments: A list of comments, each with one or more of the following
168 properties (all but one of text or html optional):
169 * "author" - human-readable name of the comment author
170 * "author_id" - user ID of the comment author
171 * "id" - Comment ID
172 * "html" - Comment as HTML
173 * "text" - Plain text of the comment
174 * "timestamp" - UNIX timestamp of comment
175 * "parent" - ID of the comment this one is replying to.
176 Set to "root" to indicate that this is a
177 comment to the original video.
8dbe9899 178 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
179 webpage_url: The url to the video webpage, if given to youtube-dl it
180 should allow to get the same result again. (It will be set
181 by YoutubeDL if it's missing)
ad3bc6ac
PH
182 categories: A list of categories that the video falls in, for example
183 ["Sports", "Berlin"]
7267bd53
PH
184 is_live: True, False, or None (=unknown). Whether this video is a
185 live stream that goes on instead of a fixed-length video.
d6983cb4 186
deefc05b 187 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 188
d838b1bd
PH
189 Unless mentioned otherwise, None is equivalent to absence of information.
190
fed5d032
PH
191
192 _type "playlist" indicates multiple videos.
b82f815f
PH
193 There must be a key "entries", which is a list, an iterable, or a PagedList
194 object, each element of which is a valid dictionary by this specification.
fed5d032
PH
195
196 Additionally, playlists can have "title" and "id" attributes with the same
197 semantics as videos (see above).
198
199
200 _type "multi_video" indicates that there are multiple videos that
201 form a single show, for example multiple acts of an opera or TV episode.
202 It must have an entries key like a playlist and contain all the keys
203 required for a video at the same time.
204
205
206 _type "url" indicates that the video must be extracted from another
207 location, possibly by a different extractor. Its only required key is:
208 "url" - the next URL to extract.
f58766ce
PH
209 The key "ie_key" can be set to the class name (minus the trailing "IE",
210 e.g. "Youtube") if the extractor class is known in advance.
211 Additionally, the dictionary may have any properties of the resolved entity
212 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
213 known ahead of time.
214
215
216 _type "url_transparent" entities have the same specification as "url", but
217 indicate that the given additional information is more precise than the one
218 associated with the resolved URL.
219 This is useful when a site employs a video service that hosts the video and
220 its technical metadata, but that video service does not embed a useful
221 title, description etc.
222
223
d6983cb4
PH
224 Subclasses of this one should re-define the _real_initialize() and
225 _real_extract() methods and define a _VALID_URL regexp.
226 Probably, they should also be added to the list of extractors.
227
d6983cb4
PH
228 Finally, the _WORKING attribute should be set to False for broken IEs
229 in order to warn the users and skip the tests.
230 """
231
232 _ready = False
233 _downloader = None
234 _WORKING = True
235
def __init__(self, downloader=None):
    """Create the extractor and attach the (optional) downloader.

    The instance starts out not ready; initialize() performs the real
    setup the first time it is called.
    """
    self._ready = False
    self.set_downloader(downloader)
240
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL.

    Returns True when the URL matches the class's _VALID_URL pattern. The
    compiled pattern is cached on the class as _VALID_URL_RE.
    """
    # Look into the class's own namespace on purpose instead of using
    # hasattr/getattr: the cache has to belong to *this* class, and an
    # attribute lookup would also find a pattern cached by a superclass.
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = cls._VALID_URL_RE.match(url)
    return matched is not None
d6983cb4 251
ed9266db
PH
252 @classmethod
253 def _match_id(cls, url):
254 if '_VALID_URL_RE' not in cls.__dict__:
255 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
256 m = cls._VALID_URL_RE.match(url)
257 assert m
258 return m.group('id')
259
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class's _WORKING attribute."""
    is_working = cls._WORKING
    return is_working
264
def initialize(self):
    """Run the real initialization (authentication, etc.) at most once.

    Once _real_initialize() has completed, the instance is flagged as
    ready and further calls are no-ops.
    """
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
270
def extract(self, url):
    """Extract the information for the given URL.

    Makes sure the extractor is initialized and then returns whatever
    _real_extract() returns for the URL.

    Raises ExtractorError: one raised during extraction is passed through
    unchanged; an incomplete HTTP read is reported as an expected network
    error; a stray KeyError or StopIteration is reported as an extractor
    error. In the two wrapping cases the original exception is attached
    as the cause.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4
PH
282
def set_downloader(self, downloader):
    """Store the downloader object this IE will use."""
    setattr(self, '_downloader', downloader)
286
287 def _real_initialize(self):
288 """Real initialization process. Redefine in subclasses."""
289 pass
290
291 def _real_extract(self, url):
292 """Real extraction process. Redefine in subclasses."""
293 pass
294
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the string used to look this IE up with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
299
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of this IE: the instance's class name minus its last two characters."""
    klass = type(self)
    return klass.__name__[:-2]
303
7cc3570e 304 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
305 """ Returns the response handle """
306 if note is None:
307 self.report_download_webpage(video_id)
308 elif note is not False:
7cc3570e 309 if video_id is None:
f1a9d64e 310 self.to_screen('%s' % (note,))
7cc3570e 311 else:
f1a9d64e 312 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 313 try:
dca08720 314 return self._downloader.urlopen(url_or_request)
d6983cb4 315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
316 if errnote is False:
317 return False
d6983cb4 318 if errnote is None:
f1a9d64e
PH
319 errnote = 'Unable to download webpage'
320 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
321 if fatal:
322 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
323 else:
324 self._downloader.report_warning(errmsg)
325 return False
d6983cb4 326
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
    """ Returns a tuple (page content as string, URL handle)

    When the request fails without being fatal, False is returned instead
    of a tuple.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        assert not fatal
        return False
    page = self._webpage_read_content(
        handle, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (page, handle)
339
c9a77969
YCH
340 @staticmethod
341 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
342 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
343 if m:
344 encoding = m.group(1)
345 else:
0d75ae2c 346 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
347 webpage_bytes[:1024])
348 if m:
349 encoding = m.group(1).decode('ascii')
b60016e8
PH
350 elif webpage_bytes.startswith(b'\xff\xfe'):
351 encoding = 'utf-16'
f143d86a
PH
352 else:
353 encoding = 'utf-8'
c9a77969
YCH
354
355 return encoding
356
357 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
358 content_type = urlh.headers.get('Content-Type', '')
359 webpage_bytes = urlh.read()
360 if prefix is not None:
361 webpage_bytes = prefix + webpage_bytes
362 if not encoding:
363 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
364 if self._downloader.params.get('dump_intermediate_pages', False):
365 try:
366 url = url_or_request.get_full_url()
367 except AttributeError:
368 url = url_or_request
f1a9d64e 369 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
370 dump = base64.b64encode(webpage_bytes).decode('ascii')
371 self._downloader.to_screen(dump)
d41e6efc
PH
372 if self._downloader.params.get('write_pages', False):
373 try:
374 url = url_or_request.get_full_url()
375 except AttributeError:
376 url = url_or_request
5afa7f8b 377 basen = '%s_%s' % (video_id, url)
c1bce22f 378 if len(basen) > 240:
f1a9d64e 379 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
380 basen = basen[:240 - len(h)] + h
381 raw_filename = basen + '.dump'
d41e6efc 382 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 383 self.to_screen('Saving request to ' + filename)
5f58165d
S
384 # Working around MAX_PATH limitation on Windows (see
385 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
386 if os.name == 'nt':
387 absfilepath = os.path.abspath(filename)
388 if len(absfilepath) > 259:
389 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
390 with open(filename, 'wb') as outf:
391 outf.write(webpage_bytes)
392
ec0fafbb
AA
393 try:
394 content = webpage_bytes.decode(encoding, 'replace')
395 except LookupError:
396 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 397
f1a9d64e
PH
398 if ('<title>Access to this site is blocked</title>' in content and
399 'Websense' in content[:512]):
400 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
401 blocked_iframe = self._html_search_regex(
402 r'<iframe src="([^"]+)"', content,
f1a9d64e 403 'Websense information URL', default=None)
2410c43d 404 if blocked_iframe:
f1a9d64e 405 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 406 raise ExtractorError(msg, expected=True)
77b2986b
PH
407 if '<title>The URL you requested has been blocked</title>' in content[:512]:
408 msg = (
409 'Access to this webpage has been blocked by Indian censorship. '
410 'Use a VPN or proxy server (with --proxy) to route around it.')
411 block_msg = self._html_search_regex(
412 r'</h1><p>(.*?)</p>',
413 content, 'block message', default=None)
414 if block_msg:
415 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
416 raise ExtractorError(msg, expected=True)
2410c43d 417
23be51d8 418 return content
d6983cb4 419
c9a77969 420 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
d6983cb4 421 """ Returns the data of the page as a string """
995ad69c
TF
422 success = False
423 try_count = 0
424 while success is False:
425 try:
c9a77969 426 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
995ad69c
TF
427 success = True
428 except compat_http_client.IncompleteRead as e:
429 try_count += 1
430 if try_count >= tries:
431 raise e
432 self._sleep(timeout, video_id)
7cc3570e
PH
433 if res is False:
434 return res
435 else:
436 content, _ = res
437 return content
d6983cb4 438
2a275ab0 439 def _download_xml(self, url_or_request, video_id,
f1a9d64e 440 note='Downloading XML', errnote='Unable to download XML',
c9a77969 441 transform_source=None, fatal=True, encoding=None):
267ed0c5 442 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 443 xml_string = self._download_webpage(
c9a77969 444 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
28746fbd
PH
445 if xml_string is False:
446 return xml_string
e2b38da9
PH
447 if transform_source:
448 xml_string = transform_source(xml_string)
267ed0c5
JMF
449 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
450
3d3538e4 451 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
452 note='Downloading JSON metadata',
453 errnote='Unable to download JSON metadata',
b090af59 454 transform_source=None,
c9a77969 455 fatal=True, encoding=None):
b090af59 456 json_string = self._download_webpage(
c9a77969
YCH
457 url_or_request, video_id, note, errnote, fatal=fatal,
458 encoding=encoding)
b090af59
PH
459 if (not fatal) and json_string is False:
460 return None
ebb64199
TF
461 return self._parse_json(
462 json_string, video_id, transform_source=transform_source, fatal=fatal)
463
464 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
465 if transform_source:
466 json_string = transform_source(json_string)
3d3538e4
PH
467 try:
468 return json.loads(json_string)
469 except ValueError as ve:
e7b6d122
PH
470 errmsg = '%s: Failed to parse JSON ' % video_id
471 if fatal:
472 raise ExtractorError(errmsg, cause=ve)
473 else:
474 self.report_warning(errmsg + str(ve))
3d3538e4 475
def report_warning(self, msg, video_id=None):
    """Report a warning through the downloader.

    The message is prefixed with '[ie_name]' and, when video_id is given,
    with 'video_id: '.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 480
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
484
def report_extraction(self, id_or_name):
    """Report that information is being extracted for the given id or name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
488
def report_download_webpage(self, video_id):
    """Report that the webpage for the given video id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
492
def report_age_confirmation(self):
    """Report that an age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 496
fc79158d
JMF
def report_login(self):
    """Report that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 500
5f6a1245 501 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a url that points to a page that should be processed

    The result is a dictionary of _type "url" holding the URL and the
    'ie_key'; 'id' and 'title' are included only when they are not None.
    """
    # TODO: ie should be the class used for getting the info
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    for key, value in (('id', video_id), ('title', video_title)):
        if value is not None:
            video_info[key] = value
    return video_info
5f6a1245 514
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist

    The result is a dictionary of _type "playlist" holding the entries;
    'id', 'title' and 'description' are added only for truthy values.
    """
    video_info = {'_type': 'playlist', 'entries': entries}
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    video_info.update((key, value) for key, value in optional_fields if value)
    return video_info
527
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a pattern string, a compiled pattern, or an iterable of
        those which are tried in order until one matches.
    group: the group to return from the match; None selects the first
        group that took part in the match.
    default: returned when nothing matches (takes precedence over fatal).
    """
    # Start out with "no match" so that an empty collection of patterns is
    # handled like any other failed search instead of leaving mobj unbound
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name when writing to a terminal that can show it
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
561
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is additionally passed through
    clean_html() and stripped of surrounding whitespace. A falsy search
    result is returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
571
fc79158d
JMF
572 def _get_login_info(self):
573 """
cf0649f8 574 Get the login info as (username, password)
fc79158d
JMF
575 It will look in the netrc file using the _NETRC_MACHINE value
576 If there's no info available, return (None, None)
577 """
578 if self._downloader is None:
579 return (None, None)
580
581 username = None
582 password = None
583 downloader_params = self._downloader.params
584
585 # Attempt to use provided username and password or .netrc data
586 if downloader_params.get('username', None) is not None:
587 username = downloader_params['username']
588 password = downloader_params['password']
589 elif downloader_params.get('usenetrc', False):
590 try:
591 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
592 if info is not None:
593 username = info[0]
594 password = info[2]
595 else:
596 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
597 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 598 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 599
fc79158d
JMF
600 return (username, password)
601
83317f69 602 def _get_tfa_info(self):
603 """
604 Get the two-factor authentication info
605 TODO - asking the user will be required for sms/phone verify
606 currently just uses the command line option
607 If there's no info available, return None
608 """
609 if self._downloader is None:
83317f69 610 return None
611 downloader_params = self._downloader.params
612
613 if downloader_params.get('twofactor', None) is not None:
614 return downloader_params['twofactor']
615
83317f69 616 return None
617
46720279
JMF
618 # Helper functions for extracting OpenGraph info
619 @staticmethod
ab2d5247 620 def _og_regexes(prop):
c1206423 621 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 622 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 623 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 624 return [
78fb87b2
JMF
625 template % (property_re, content_re),
626 template % (content_re, property_re),
ab2d5247 627 ]
46720279 628
3c4e6d83 629 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 630 if name is None:
3c4e6d83 631 name = 'OpenGraph %s' % prop
ab2d5247 632 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
633 if escaped is None:
634 return None
635 return unescapeHTML(escaped)
46720279
JMF
636
637 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 638 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
639
640 def _og_search_description(self, html, **kargs):
641 return self._og_search_property('description', html, fatal=False, **kargs)
642
643 def _og_search_title(self, html, **kargs):
644 return self._og_search_property('title', html, **kargs)
645
8ffa13e0 646 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
647 regexes = self._og_regexes('video') + self._og_regexes('video:url')
648 if secure:
649 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 650 return self._html_search_regex(regexes, html, name, **kargs)
46720279 651
78338f71
JMF
652 def _og_search_url(self, html, **kargs):
653 return self._og_search_property('url', html, **kargs)
654
40c696e5 655 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
656 if display_name is None:
657 display_name = name
658 return self._html_search_regex(
6c6f1408 659 r'''(?isx)<meta
711ede6e 660 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
bec22481 661 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
711ede6e 662 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
663
664 def _dc_search_uploader(self, html):
665 return self._html_search_meta('dc.creator', html, 'uploader')
666
8dbe9899
PH
667 def _rta_search(self, html):
668 # See http://www.rtalabel.org/index.php?content=howtofaq#single
669 if re.search(r'(?ix)<meta\s+name="rating"\s+'
670 r' content="RTA-5042-1996-1400-1577-RTA"',
671 html):
672 return 18
673 return 0
674
59040888
PH
675 def _media_rating_search(self, html):
676 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
677 rating = self._html_search_meta('rating', html)
678
679 if not rating:
680 return None
681
682 RATING_TABLE = {
683 'safe for kids': 0,
684 'general': 8,
685 '14 years': 14,
686 'mature': 17,
687 'restricted': 19,
688 }
689 return RATING_TABLE.get(rating.lower(), None)
690
69319969 691 def _family_friendly_search(self, html):
6ca7732d 692 # See http://schema.org/VideoObject
69319969
NJ
693 family_friendly = self._html_search_meta('isFamilyFriendly', html)
694
695 if not family_friendly:
696 return None
697
698 RATING_TABLE = {
699 '1': 0,
700 'true': 0,
701 '0': 18,
702 'false': 18,
703 }
704 return RATING_TABLE.get(family_friendly.lower(), None)
705
0c708f11
JMF
706 def _twitter_search_player(self, html):
707 return self._html_search_meta('twitter:player', html,
9e1a5b84 708 'twitter card player')
0c708f11 709
27713812 710 @staticmethod
f8da79f8
S
711 def _hidden_inputs(html):
712 return dict([
713 (input.group('name'), input.group('value')) for input in re.finditer(
714 r'''(?x)
715 <input\s+
716 type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
717 name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
718 (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
719 value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
720 ''', html)
721 ])
27713812 722
cf61d96d
S
723 def _form_hidden_inputs(self, form_id, html):
724 form = self._search_regex(
725 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
726 html, '%s form' % form_id, group='form')
727 return self._hidden_inputs(form)
728
def _sort_formats(self, formats, field_preference=None):
    """Sort the list of format dictionaries in place, in ascending key order.

    field_preference: if a list or tuple of field names, the formats are
        ordered by exactly these fields (a missing or None value counts
        as -1). Otherwise the built-in key is used: the format's
        preference first, then language_preference, quality, tbr,
        filesize, vbr, height, width, the extension ranking, abr, the
        audio extension ranking, fps, filesize_approx, source_preference
        and finally format_id.

    A format without 'ext' gets it filled in from its 'url'.

    Raises ExtractorError if formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    def _field(f, field, missing=-1):
        # Value of the field, with None treated like an absent field
        value = f.get(field)
        return missing if value is None else value

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            return tuple(_field(f, field) for field in field_preference)

        def _ext_rank(order):
            # Position of the extension in the ranking, -1 if not ranked
            try:
                return order.index(f['ext'])
            except ValueError:
                return -1

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _ext_rank(ORDER)
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _ext_rank(ORDER)
            audio_ext_preference = 0

        return (
            preference,
            _field(f, 'language_preference'),
            _field(f, 'quality'),
            _field(f, 'tbr'),
            _field(f, 'filesize'),
            _field(f, 'vbr'),
            _field(f, 'height'),
            _field(f, 'width'),
            ext_preference,
            _field(f, 'abr'),
            audio_ext_preference,
            _field(f, 'fps'),
            _field(f, 'filesize_approx'),
            _field(f, 'source_preference'),
            _field(f, 'format_id', ''),
        )

    formats.sort(key=_formats_key)
59040888 791
96a53167
S
def _check_formats(self, formats, video_id):
    """Drop, in place, every format whose URL fails self._is_valid_url.

    Does nothing when ``formats`` is empty or None.
    """
    if not formats:
        return

    def _describe(f):
        # Label passed to _is_valid_url as its ``item`` argument
        format_id = f.get('format_id')
        return '%s video format' % format_id if format_id else 'video'

    formats[:] = [
        f for f in formats
        if self._is_valid_url(f['url'], video_id, item=_describe(f))]
799
def _is_valid_url(self, url, video_id, item='video'):
    """Return whether ``url`` can be requested.

    Protocol-relative URLs are treated as http.  URLs that are not
    http(s) are reported as valid without any request.  An ExtractorError
    caused by an HTTP error means "invalid" (a message is printed and
    False is returned); any other ExtractorError is re-raised.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item)
    except ExtractorError as err:
        if not isinstance(err.cause, compat_HTTPError):
            raise
        self.to_screen(
            '%s: %s URL is invalid, skipping' % (video_id, item))
        return False
    return True
814
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # 'prefer_insecure' selects plain http; https is the default
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
821
57c7411f
PH
def _proto_relative_url(self, url, scheme=None):
    """Prefix a protocol-relative URL ('//host/path') with a scheme.

    ``scheme`` includes the trailing colon (e.g. 'http:'); when it is
    None, self.http_scheme() is used.  None and URLs that do not start
    with '//' are returned unchanged.
    """
    if url is None or not url.startswith('//'):
        return url
    prefix = self.http_scheme() if scheme is None else scheme
    return prefix + url
831
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """Print a waiting message, then block for ``timeout`` seconds.

    ``msg_template`` is a %-format string that may use the named fields
    ``video_id`` and ``timeout``; a default message is used when None.
    """
    template = (
        '%(video_id)s: Waiting for %(timeout)s seconds'
        if msg_template is None else msg_template)
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
838
a38436e8
YCH
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip()):
    """Download an f4m manifest and return a sorted list of format dicts.

    manifest_url     -- URL of the manifest.
    video_id         -- passed through to the download helper.
    preference       -- stored as 'preference' on every format.
    f4m_id           -- optional prefix for the generated format ids.
    transform_source -- applied to the downloaded document before parsing;
                        also used for nested manifests.

    Media nodes are looked up in the f4m 1.0 namespace first and in the 2.0
    namespace if none are found.  For 1.0 manifests every format uses the
    manifest URL itself.  For 2.0 manifests the media node's 'href'/'url'
    is used; nodes without either are skipped and nodes that point to
    another f4m manifest are extracted recursively.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source)

    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')

    # Relative media URLs are always resolved against the directory of the
    # manifest being parsed.  This must not change between iterations,
    # otherwise the second relative URL would be resolved against the
    # first media URL.
    base_url = '/'.join(manifest_url.split('/')[:-1])

    formats = []
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = base_url + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id,
                    transform_source=transform_source))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # Fall back to the node index when no bitrate is declared
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
882
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return a sorted list of format dicts.

    The first format always describes the playlist itself ("meta" format,
    ranked one below ``preference``).  Every URI line that follows an
    #EXT-X-STREAM-INF tag becomes a format built from that tag's
    attributes (BANDWIDTH, CODECS, RESOLUTION); URI lines seen before any
    such tag become bare ``{'url': ...}`` formats.  The attributes of the
    most recent #EXT-X-MEDIA tag are attached to the next format as
    'm3u8_media'.

    m3u8_url       -- playlist URL; relative URIs are resolved against it.
    ext            -- stored as 'ext' on the formats.
    entry_protocol -- stored as 'protocol' on the variant formats.
    m3u8_id        -- optional prefix for the generated format ids.
    note, errnote  -- override the progress / error messages.
    fatal          -- passed to the download helper.

    Returns False if the download helper returned False.
    """
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': (preference - 1) if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Absolute http(s) URLs are used verbatim
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # KEY=value pairs of a tag line; surrounding quotes are removed
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attributes[m.group('key')] = v
        return attributes

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if m3u8_doc is False:
        return m3u8_doc

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        # Strip once so that tags and URIs are handled the same way no
        # matter which branch they end up in
        line = line.strip()
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
        elif not line or line.startswith('#'):
            continue
        else:
            if last_info is None:
                formats.append({'url': format_url(line)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
            # Fall back to the bitrate, then to the running format count
            format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
            f = {
                'format_id': '-'.join(format_id),
                'url': format_url(line),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                # Ignore a malformed RESOLUTION instead of aborting the
                # whole extraction with a ValueError
                res_mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                if res_mobj:
                    f['width'] = int(res_mobj.group('width'))
                    f['height'] = int(res_mobj.group('height'))
            if last_media is not None:
                f['m3u8_media'] = last_media
                last_media = None
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
969
e89a2aab 970 # TODO: improve extraction
def _extract_smil_formats(self, smil_url, video_id, fatal=True):
    """Download a SMIL file and return a sorted list of format dicts.

    If the document has ./body/seq/video elements only the first one is
    used; otherwise every ./body/switch/video element is parsed.  The
    'base' attribute of ./head/meta (None when there is no such element)
    is handed to self._parse_smil_video for each video.

    Returns an empty list when the download helper returned False, which
    is only expected with fatal=False.
    """
    smil = self._download_xml(
        smil_url, video_id, 'Downloading SMIL file',
        'Unable to download SMIL file', fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    # find() returns None when the document has no <head><meta>
    meta = smil.find('./head/meta')
    base = meta.get('base') if meta is not None else None

    videos = smil.findall('./body/seq/video')
    if videos:
        videos = videos[:1]
    else:
        videos = smil.findall('./body/switch/video')

    formats = []
    rtmp_count = 0
    for video in videos:
        fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
        formats.extend(fmts)

    self._sort_formats(formats)

    return formats
995
def _parse_smil_video(self, video, video_id, base, rtmp_count):
    """Turn one SMIL <video> element into format dicts.

    video      -- the element; 'src', 'proto', 'ext', 'streamer', 'width',
                  'height' and 'system-bitrate'/'systemBitrate' are read.
    base       -- base URL of the SMIL document, or None.
    rtmp_count -- number of rtmp formats produced so far.

    The protocol is the 'proto' attribute or, if absent, is guessed from
    the prefix of ``base``.

    Returns a ``(formats, rtmp_count)`` tuple; rtmp_count is incremented
    when an rtmp format is produced.  ``formats`` is empty when the
    element has no 'src' or its protocol cannot be determined or is not
    supported.
    """
    src = video.get('src')
    if not src:
        return ([], rtmp_count)
    bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
    width = int_or_none(video.get('width'))
    height = int_or_none(video.get('height'))
    proto = video.get('proto')
    if not proto and base:
        if base.startswith('rtmp'):
            proto = 'rtmp'
        elif base.startswith('http'):
            proto = 'http'
    ext = video.get('ext')
    if proto == 'm3u8':
        return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
    if proto == 'rtmp':
        rtmp_count += 1
        streamer = video.get('streamer') or base
        return ([{
            'url': streamer,
            'play_path': src,
            'ext': 'flv',
            'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
            'tbr': bitrate,
            'width': width,
            'height': height,
        }], rtmp_count)
    if proto and proto.startswith('http'):
        return ([{
            # Without a base the src has to be a complete URL already
            'url': (base or '') + src,
            'ext': ext or 'flv',
            'tbr': bitrate,
            'width': width,
            'height': height,
        }], rtmp_count)
    # Unknown or undeterminable protocol: always hand the caller a tuple
    # it can unpack instead of an implicit None
    return ([], rtmp_count)
63757032 1033
f4b1c7ad
PH
def _live_title(self, name):
    """ Generate the title for a live video """
    # name followed by the current local time, minute precision
    return name + ' ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
1039
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert ``v`` with int_or_none, reporting values that do not parse.

    v      -- value to convert.
    name   -- what is being extracted; used in the error message only.
    fatal  -- raise ExtractorError instead of emitting a warning when the
              conversion yields None.
    kwargs -- forwarded unchanged to int_or_none.

    Returns the converted value, or None (after a warning) when it could
    not be parsed and ``fatal`` is false.
    """
    # A leftover debugging print of getattr(v, kwargs['get_attr']) used to
    # live here; it wrote to stdout and raised AttributeError whenever
    # ``v`` lacked the attribute.
    res = int_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
1051
def _float(self, v, name, fatal=False, **kwargs):
    """Convert ``v`` with float_or_none, reporting values that do not parse.

    ``name`` is used in the message only and ``kwargs`` are forwarded to
    float_or_none.  When the result is None, an ExtractorError is raised
    if ``fatal`` is true; otherwise a warning is emitted and None returned.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
1061
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for ``domain`` (path '/') in the downloader's cookie jar.

    ``expire_time`` is passed as the cookie's ``expires`` value.
    """
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
1067
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared through _TEST or _TESTS.

    Cases flagged 'only_matching' are skipped unless
    ``include_onlymatching`` is true.  Each yielded dict gets a 'name'
    entry: the class name without its last two characters (the 'IE'
    suffix).
    """
    cls_name = type(self).__name__
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % cls_name
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])
    for case in candidates:
        if case.get('only_matching', False) and not include_onlymatching:
            continue
        case['name'] = cls_name[:-len('IE')]
        yield case
1081
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        # A playlist test is judged by its first entry
        sample = tc['playlist'][0] if 'playlist' in tc else tc
        content_limit = sample.get('info_dict', {}).get('age_limit')
        if not age_restricted(content_limit, age_limit):
            return True
        saw_restricted = True
    # Suitable when there are no tests at all, unsuitable when every test
    # is restricted
    return not saw_restricted
1096
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(*args, **kwargs) if subtitles are wanted.

    Subtitles are wanted when the 'writesubtitles' or 'listsubtitles'
    downloader option is set; otherwise an empty dict is returned.
    """
    params = self._downloader.params
    wanted = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
1102
def _get_subtitles(self, *args, **kwargs):
    """Placeholder that always raises NotImplementedError."""
    message = "This method must be implemented by subclasses"
    raise NotImplementedError(message)
1105
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(*args, **kwargs) if wanted.

    Automatic captions are wanted when the 'writeautomaticsub' or
    'listsubtitles' downloader option is set; otherwise an empty dict is
    returned.
    """
    params = self._downloader.params
    wanted = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
1111
def _get_automatic_captions(self, *args, **kwargs):
    """Placeholder that always raises NotImplementedError."""
    message = "This method must be implemented by subclasses"
    raise NotImplementedError(message)
1114
8dbe9899 1115
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    Accepted "URLs" have the form ``<_SEARCH_KEY><prefix>:<query>`` where
    the prefix is empty (one result), a positive number (that many
    results) or ``all`` (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if ``url`` is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError for a query that does not match or asks for
        a non-positive number of results.  A number above _MAX_RESULTS is
        clamped to it after a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY