]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Merge pull request #6533 from sceext2/fix-iqiyi-2015-08-10
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_cookies,
18 compat_HTTPError,
19 compat_http_client,
20 compat_urllib_error,
21 compat_urllib_parse,
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
24 compat_urlparse,
25 compat_str,
26 )
27 from ..utils import (
28 NO_DEFAULT,
29 age_restricted,
30 bug_reports_message,
31 clean_html,
32 compiled_regex_type,
33 determine_ext,
34 ExtractorError,
35 fix_xml_ampersands,
36 float_or_none,
37 int_or_none,
38 RegexNotFoundError,
39 sanitize_filename,
40 unescapeHTML,
41 url_basename,
42 xpath_text,
43 xpath_with_ns,
44 )
45
46
47 class InfoExtractor(object):
48 """Information Extractor class.
49
50 Information extractors are the classes that, given a URL, extract
51 information about the video (or videos) the URL refers to. This
52 information includes the real video URL, the video title, author and
53 others. The information is stored in a dictionary which is then
54 passed to the YoutubeDL. The YoutubeDL processes this
55 information possibly downloading the video to the file system, among
56 other possible outcomes.
57
58 The type field determines the type of the result.
59 By far the most common value (and the default if _type is missing) is
60 "video", which indicates a single video.
61
62 For a video, the dictionaries must include the following fields:
63
64 id: Video identifier.
65 title: Video title, unescaped.
66
67 Additionally, it must contain either a formats entry or a url one:
68
69 formats: A list of dictionaries for each format available, ordered
70 from worst to best quality.
71
72 Potential fields:
73 * url Mandatory. The URL of the video file
74 * ext Will be calculated from URL if missing
75 * format A human-readable description of the format
76 ("mp4 container with h264/opus").
77 Calculated from the format_id, width, height.
78 and format_note fields if missing.
79 * format_id A short description of the format
80 ("mp4_h264_opus" or "19").
81 Technically optional, but strongly recommended.
82 * format_note Additional info about the format
83 ("3D" or "DASH video")
84 * width Width of the video, if known
85 * height Height of the video, if known
86 * resolution Textual description of width and height
87 * tbr Average bitrate of audio and video in KBit/s
88 * abr Average audio bitrate in KBit/s
89 * acodec Name of the audio codec in use
90 * asr Audio sampling rate in Hertz
91 * vbr Average video bitrate in KBit/s
92 * fps Frame rate
93 * vcodec Name of the video codec in use
94 * container Name of the container format
95 * filesize The number of bytes, if known in advance
96 * filesize_approx An estimate for the number of bytes
97 * player_url SWF Player URL (used for rtmpdump).
98 * protocol The protocol that will be used for the actual
99 download, lower-case.
100 "http", "https", "rtsp", "rtmp", "rtmpe",
101 "m3u8", or "m3u8_native".
102 * preference Order number of this format. If this field is
103 present and not None, the formats get sorted
104 by this field, regardless of all other values.
105 -1 for default (order by other properties),
106 -2 or smaller for less than default.
107 < -1000 to hide the format (if there is
108 another one which is strictly better)
109 * language_preference Is this in the correct requested
110 language?
111 10 if it's what the URL is about,
112 -1 for default (don't know),
113 -10 otherwise, other values reserved for now.
114 * quality Order number of the video quality of this
115 format, irrespective of the file format.
116 -1 for default (order by other properties),
117 -2 or smaller for less than default.
118 * source_preference Order number for this video source
119 (quality takes higher priority)
120 -1 for default (order by other properties),
121 -2 or smaller for less than default.
122 * http_headers A dictionary of additional HTTP headers
123 to add to the request.
124 * stretched_ratio If given and not 1, indicates that the
125 video's pixels are not square.
126 width : height ratio as float.
127 * no_resume The server does not support resuming the
128 (HTTP or RTMP) download. Boolean.
129
130 url: Final video URL.
131 ext: Video filename extension.
132 format: The video format, defaults to ext (used for --get-format)
133 player_url: SWF Player URL (used for rtmpdump).
134
135 The following fields are optional:
136
137 alt_title: A secondary title of the video.
138 display_id An alternative identifier for the video, not necessarily
139 unique, but available before title. Typically, id is
140 something like "4234987", title "Dancing naked mole rats",
141 and display_id "dancing-naked-mole-rats"
142 thumbnails: A list of dictionaries, with the following entries:
143 * "id" (optional, string) - Thumbnail format ID
144 * "url"
145 * "preference" (optional, int) - quality of the image
146 * "width" (optional, int)
147 * "height" (optional, int)
                     * "resolution" (optional, string "{width}x{height}",
                       deprecated)
150 thumbnail: Full URL to a video thumbnail image.
151 description: Full video description.
152 uploader: Full name of the video uploader.
153 creator: The main artist who created the video.
154 timestamp: UNIX timestamp of the moment the video became available.
155 upload_date: Video upload date (YYYYMMDD).
156 If not explicitly set, calculated from timestamp.
157 uploader_id: Nickname or id of the video uploader.
158 location: Physical location where the video was filmed.
159 subtitles: The available subtitles as a dictionary in the format
160 {language: subformats}. "subformats" is a list sorted from
161 lower to higher preference, each element is a dictionary
162 with the "ext" entry and one of:
163 * "data": The subtitles file contents
164 * "url": A URL pointing to the subtitles file
165 automatic_captions: Like 'subtitles', used by the YoutubeIE for
166 automatically generated captions
167 duration: Length of the video in seconds, as an integer.
168 view_count: How many users have watched the video on the platform.
169 like_count: Number of positive ratings of the video
170 dislike_count: Number of negative ratings of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
172 comment_count: Number of comments on the video
173 comments: A list of comments, each with one or more of the following
174 properties (all but one of text or html optional):
175 * "author" - human-readable name of the comment author
176 * "author_id" - user ID of the comment author
177 * "id" - Comment ID
178 * "html" - Comment as HTML
179 * "text" - Plain text of the comment
180 * "timestamp" - UNIX timestamp of comment
181 * "parent" - ID of the comment this one is replying to.
182 Set to "root" to indicate that this is a
183 comment to the original video.
184 age_limit: Age restriction for the video, as an integer (years)
185 webpage_url: The URL to the video webpage, if given to youtube-dl it
186 should allow to get the same result again. (It will be set
187 by YoutubeDL if it's missing)
188 categories: A list of categories that the video falls in, for example
189 ["Sports", "Berlin"]
190 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
191 is_live: True, False, or None (=unknown). Whether this video is a
192 live stream that goes on instead of a fixed-length video.
193 start_time: Time in seconds where the reproduction should start, as
194 specified in the URL.
195 end_time: Time in seconds where the reproduction should end, as
196 specified in the URL.
197
198 Unless mentioned otherwise, the fields should be Unicode strings.
199
200 Unless mentioned otherwise, None is equivalent to absence of information.
201
202
203 _type "playlist" indicates multiple videos.
204 There must be a key "entries", which is a list, an iterable, or a PagedList
205 object, each element of which is a valid dictionary by this specification.
206
207 Additionally, playlists can have "title", "description" and "id" attributes
208 with the same semantics as videos (see above).
209
210
211 _type "multi_video" indicates that there are multiple videos that
212 form a single show, for examples multiple acts of an opera or TV episode.
213 It must have an entries key like a playlist and contain all the keys
214 required for a video at the same time.
215
216
217 _type "url" indicates that the video must be extracted from another
218 location, possibly by a different extractor. Its only required key is:
219 "url" - the next URL to extract.
220 The key "ie_key" can be set to the class name (minus the trailing "IE",
221 e.g. "Youtube") if the extractor class is known in advance.
222 Additionally, the dictionary may have any properties of the resolved entity
223 known in advance, for example "title" if the title of the referred video is
224 known ahead of time.
225
226
227 _type "url_transparent" entities have the same specification as "url", but
228 indicate that the given additional information is more precise than the one
229 associated with the resolved URL.
230 This is useful when a site employs a video service that hosts the video and
231 its technical metadata, but that video service does not embed a useful
232 title, description etc.
233
234
235 Subclasses of this one should re-define the _real_initialize() and
236 _real_extract() methods and define a _VALID_URL regexp.
237 Probably, they should also be added to the list of extractors.
238
239 Finally, the _WORKING attribute should be set to False for broken IEs
240 in order to warn the users and skip the tests.
241 """
242
243 _ready = False
244 _downloader = None
245 _WORKING = True
246
247 def __init__(self, downloader=None):
248 """Constructor. Receives an optional downloader."""
249 self._ready = False
250 self.set_downloader(downloader)
251
252 @classmethod
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
255
256 # This does not use has/getattr intentionally - we want to know whether
257 # we have cached the regexp for *this* class, whereas getattr would also
258 # match the superclass
259 if '_VALID_URL_RE' not in cls.__dict__:
260 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
261 return cls._VALID_URL_RE.match(url) is not None
262
263 @classmethod
264 def _match_id(cls, url):
265 if '_VALID_URL_RE' not in cls.__dict__:
266 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
267 m = cls._VALID_URL_RE.match(url)
268 assert m
269 return m.group('id')
270
271 @classmethod
272 def working(cls):
273 """Getter method for _WORKING."""
274 return cls._WORKING
275
276 def initialize(self):
277 """Initializes an instance (authentication, etc)."""
278 if not self._ready:
279 self._real_initialize()
280 self._ready = True
281
282 def extract(self, url):
283 """Extracts URL information and returns it in list of dicts."""
284 try:
285 self.initialize()
286 return self._real_extract(url)
287 except ExtractorError:
288 raise
289 except compat_http_client.IncompleteRead as e:
290 raise ExtractorError('A network error has occured.', cause=e, expected=True)
291 except (KeyError, StopIteration) as e:
292 raise ExtractorError('An extractor error has occured.', cause=e)
293
294 def set_downloader(self, downloader):
295 """Sets the downloader for this IE."""
296 self._downloader = downloader
297
298 def _real_initialize(self):
299 """Real initialization process. Redefine in subclasses."""
300 pass
301
302 def _real_extract(self, url):
303 """Real extraction process. Redefine in subclasses."""
304 pass
305
306 @classmethod
307 def ie_key(cls):
308 """A string for getting the InfoExtractor with get_info_extractor"""
309 return cls.__name__[:-2]
310
311 @property
312 def IE_NAME(self):
313 return type(self).__name__[:-2]
314
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message;
        # False -> print nothing; anything else -> printed, prefixed with
        # the video id when one is available.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote semantics: False -> fail silently (return False);
            # None -> generic message; anything else -> used as the message.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            if fatal:
                # Pass the original traceback along so the root cause is visible
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
337
338 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
339 """ Returns a tuple (page content as string, URL handle) """
340 # Strip hashes from the URL (#1038)
341 if isinstance(url_or_request, (compat_str, str)):
342 url_or_request = url_or_request.partition('#')[0]
343
344 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
345 if urlh is False:
346 assert not fatal
347 return False
348 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
349 return (content, urlh)
350
351 @staticmethod
352 def _guess_encoding_from_content(content_type, webpage_bytes):
353 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
354 if m:
355 encoding = m.group(1)
356 else:
357 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
358 webpage_bytes[:1024])
359 if m:
360 encoding = m.group(1).decode('ascii')
361 elif webpage_bytes.startswith(b'\xff\xfe'):
362 encoding = 'utf-16'
363 else:
364 encoding = 'utf-8'
365
366 return encoding
367
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an already-opened URL handle.

        Honors the dump_intermediate_pages / write_pages downloader options,
        guesses the text encoding when none is given, and raises on known
        filtering/censorship block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename under typical filesystem limits while
                # staying unique via an md5 of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8 instead of failing
            content = webpage_bytes.decode('utf-8', 'replace')

        # Fail loudly (expected=True) on well-known block pages
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
430
431 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
432 """ Returns the data of the page as a string """
433 success = False
434 try_count = 0
435 while success is False:
436 try:
437 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
438 success = True
439 except compat_http_client.IncompleteRead as e:
440 try_count += 1
441 if try_count >= tries:
442 raise e
443 self._sleep(timeout, video_id)
444 if res is False:
445 return res
446 else:
447 content, _ = res
448 return content
449
450 def _download_xml(self, url_or_request, video_id,
451 note='Downloading XML', errnote='Unable to download XML',
452 transform_source=None, fatal=True, encoding=None):
453 """Return the xml as an xml.etree.ElementTree.Element"""
454 xml_string = self._download_webpage(
455 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
456 if xml_string is False:
457 return xml_string
458 if transform_source:
459 xml_string = transform_source(xml_string)
460 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
461
462 def _download_json(self, url_or_request, video_id,
463 note='Downloading JSON metadata',
464 errnote='Unable to download JSON metadata',
465 transform_source=None,
466 fatal=True, encoding=None):
467 json_string = self._download_webpage(
468 url_or_request, video_id, note, errnote, fatal=fatal,
469 encoding=encoding)
470 if (not fatal) and json_string is False:
471 return None
472 return self._parse_json(
473 json_string, video_id, transform_source=transform_source, fatal=fatal)
474
475 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
476 if transform_source:
477 json_string = transform_source(json_string)
478 try:
479 return json.loads(json_string)
480 except ValueError as ve:
481 errmsg = '%s: Failed to parse JSON ' % video_id
482 if fatal:
483 raise ExtractorError(errmsg, cause=ve)
484 else:
485 self.report_warning(errmsg + str(ve))
486
487 def report_warning(self, msg, video_id=None):
488 idstr = '' if video_id is None else '%s: ' % video_id
489 self._downloader.report_warning(
490 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
491
492 def to_screen(self, msg):
493 """Print msg to screen, prefixing it with '[ie_name]'"""
494 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
495
496 def report_extraction(self, id_or_name):
497 """Report information extraction."""
498 self.to_screen('%s: Extracting information' % id_or_name)
499
500 def report_download_webpage(self, video_id):
501 """Report webpage download."""
502 self.to_screen('%s: Downloading webpage' % video_id)
503
504 def report_age_confirmation(self):
505 """Report attempt to confirm age."""
506 self.to_screen('Confirming age')
507
508 def report_login(self):
509 """Report attempt to log in."""
510 self.to_screen('Logging in')
511
512 # Methods for following #608
513 @staticmethod
514 def url_result(url, ie=None, video_id=None, video_title=None):
515 """Returns a URL that points to a page that should be processed"""
516 # TODO: ie should be the class used for getting the info
517 video_info = {'_type': 'url',
518 'url': url,
519 'ie_key': ie}
520 if video_id is not None:
521 video_info['id'] = video_id
522 if video_title is not None:
523 video_info['title'] = video_title
524 return video_info
525
526 @staticmethod
527 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
528 """Returns a playlist"""
529 video_info = {'_type': 'playlist',
530 'entries': entries}
531 if playlist_id:
532 video_info['id'] = playlist_id
533 if playlist_title:
534 video_info['title'] = playlist_title
535 if playlist_description:
536 video_info['description'] = playlist_description
537 return video_info
538
539 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
540 """
541 Perform a regex search on the given string, using a single or a list of
542 patterns returning the first matching group.
543 In case of failure return a default value or raise a WARNING or a
544 RegexNotFoundError, depending on fatal, specifying the field name.
545 """
546 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
547 mobj = re.search(pattern, string, flags)
548 else:
549 for p in pattern:
550 mobj = re.search(p, string, flags)
551 if mobj:
552 break
553
554 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
555 _name = '\033[0;34m%s\033[0m' % name
556 else:
557 _name = name
558
559 if mobj:
560 if group is None:
561 # return the first matching group
562 return next(g for g in mobj.groups() if g is not None)
563 else:
564 return mobj.group(group)
565 elif default is not NO_DEFAULT:
566 return default
567 elif fatal:
568 raise RegexNotFoundError('Unable to extract %s' % _name)
569 else:
570 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
571 return None
572
573 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
574 """
575 Like _search_regex, but strips HTML tags and unescapes entities.
576 """
577 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
578 if res:
579 return clean_html(res).strip()
580 else:
581 return res
582
583 def _get_login_info(self):
584 """
585 Get the login info as (username, password)
586 It will look in the netrc file using the _NETRC_MACHINE value
587 If there's no info available, return (None, None)
588 """
589 if self._downloader is None:
590 return (None, None)
591
592 username = None
593 password = None
594 downloader_params = self._downloader.params
595
596 # Attempt to use provided username and password or .netrc data
597 if downloader_params.get('username', None) is not None:
598 username = downloader_params['username']
599 password = downloader_params['password']
600 elif downloader_params.get('usenetrc', False):
601 try:
602 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
603 if info is not None:
604 username = info[0]
605 password = info[2]
606 else:
607 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
608 except (IOError, netrc.NetrcParseError) as err:
609 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
610
611 return (username, password)
612
613 def _get_tfa_info(self):
614 """
615 Get the two-factor authentication info
616 TODO - asking the user will be required for sms/phone verify
617 currently just uses the command line option
618 If there's no info available, return None
619 """
620 if self._downloader is None:
621 return None
622 downloader_params = self._downloader.params
623
624 if downloader_params.get('twofactor', None) is not None:
625 return downloader_params['twofactor']
626
627 return None
628
629 # Helper functions for extracting OpenGraph info
630 @staticmethod
631 def _og_regexes(prop):
632 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
633 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
634 template = r'<meta[^>]+?%s[^>]+?%s'
635 return [
636 template % (property_re, content_re),
637 template % (content_re, property_re),
638 ]
639
640 @staticmethod
641 def _meta_regex(prop):
642 return r'''(?isx)<meta
643 (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
644 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
645
646 def _og_search_property(self, prop, html, name=None, **kargs):
647 if name is None:
648 name = 'OpenGraph %s' % prop
649 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
650 if escaped is None:
651 return None
652 return unescapeHTML(escaped)
653
654 def _og_search_thumbnail(self, html, **kargs):
655 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
656
657 def _og_search_description(self, html, **kargs):
658 return self._og_search_property('description', html, fatal=False, **kargs)
659
660 def _og_search_title(self, html, **kargs):
661 return self._og_search_property('title', html, **kargs)
662
663 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
664 regexes = self._og_regexes('video') + self._og_regexes('video:url')
665 if secure:
666 regexes = self._og_regexes('video:secure_url') + regexes
667 return self._html_search_regex(regexes, html, name, **kargs)
668
669 def _og_search_url(self, html, **kargs):
670 return self._og_search_property('url', html, **kargs)
671
672 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
673 if display_name is None:
674 display_name = name
675 return self._html_search_regex(
676 self._meta_regex(name),
677 html, display_name, fatal=fatal, group='content', **kwargs)
678
679 def _dc_search_uploader(self, html):
680 return self._html_search_meta('dc.creator', html, 'uploader')
681
682 def _rta_search(self, html):
683 # See http://www.rtalabel.org/index.php?content=howtofaq#single
684 if re.search(r'(?ix)<meta\s+name="rating"\s+'
685 r' content="RTA-5042-1996-1400-1577-RTA"',
686 html):
687 return 18
688 return 0
689
690 def _media_rating_search(self, html):
691 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
692 rating = self._html_search_meta('rating', html)
693
694 if not rating:
695 return None
696
697 RATING_TABLE = {
698 'safe for kids': 0,
699 'general': 8,
700 '14 years': 14,
701 'mature': 17,
702 'restricted': 19,
703 }
704 return RATING_TABLE.get(rating.lower(), None)
705
706 def _family_friendly_search(self, html):
707 # See http://schema.org/VideoObject
708 family_friendly = self._html_search_meta('isFamilyFriendly', html)
709
710 if not family_friendly:
711 return None
712
713 RATING_TABLE = {
714 '1': 0,
715 'true': 0,
716 '0': 18,
717 'false': 18,
718 }
719 return RATING_TABLE.get(family_friendly.lower(), None)
720
721 def _twitter_search_player(self, html):
722 return self._html_search_meta('twitter:player', html,
723 'twitter card player')
724
725 @staticmethod
726 def _hidden_inputs(html):
727 return dict([
728 (input.group('name'), input.group('value')) for input in re.finditer(
729 r'''(?x)
730 <input\s+
731 type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
732 name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
733 (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
734 value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
735 ''', html)
736 ])
737
738 def _form_hidden_inputs(self, form_id, html):
739 form = self._search_regex(
740 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
741 html, '%s form' % form_id, group='form')
742 return self._hidden_inputs(form)
743
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in-place from worst to best quality.

        When field_preference is a list/tuple of format-dict keys, formats are
        sorted by those fields only (missing values sort as -1). Otherwise a
        heuristic key is used; see the tuple at the bottom of _formats_key for
        the exact precedence order.

        Raises ExtractorError when the list is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    # Fall back to the URL scheme when no protocol is declared
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) is slightly preferred over other protocols
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare lexicographically: earlier entries dominate.
            # Missing numeric fields sort as -1 (i.e. worst).
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
806
807 def _check_formats(self, formats, video_id):
808 if formats:
809 formats[:] = filter(
810 lambda f: self._is_valid_url(
811 f['url'], video_id,
812 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
813 formats)
814
815 def _is_valid_url(self, url, video_id, item='video'):
816 url = self._proto_relative_url(url, scheme='http:')
817 # For now assume non HTTP(S) URLs always valid
818 if not (url.startswith('http://') or url.startswith('https://')):
819 return True
820 try:
821 self._request_webpage(url, video_id, 'Checking %s URL' % item)
822 return True
823 except ExtractorError as e:
824 if isinstance(e.cause, compat_HTTPError):
825 self.to_screen(
826 '%s: %s URL is invalid, skipping' % (video_id, item))
827 return False
828 raise
829
830 def http_scheme(self):
831 """ Either "http:" or "https:", depending on the user's preferences """
832 return (
833 'http:'
834 if self._downloader.params.get('prefer_insecure', False)
835 else 'https:')
836
837 def _proto_relative_url(self, url, scheme=None):
838 if url is None:
839 return url
840 if url.startswith('//'):
841 if scheme is None:
842 scheme = self.http_scheme()
843 return scheme + url
844 else:
845 return url
846
847 def _sleep(self, timeout, video_id, msg_template=None):
848 if msg_template is None:
849 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
850 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
851 self.to_screen(msg)
852 time.sleep(timeout)
853
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
        """Download an Adobe HDS (f4m) manifest and return its sorted format list.

        Handles both the 1.0 and 2.0 manifest namespaces; for 2.0, nested f4m
        manifests referenced by a media node are extracted recursively.
        """
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source)

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # No media nodes in the 1.0 namespace: retry with the 2.0 one
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                # Relative media URLs are resolved against the manifest's directory
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                # Fall back to the node index when no bitrate is declared
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats
897
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        """Download and parse an HLS (m3u8) master playlist into formats.

        m3u8_url -- URL of the master playlist
        video_id -- identifier used in progress/error messages
        ext -- container extension reported on each parsed variant
        entry_protocol -- protocol recorded on each parsed variant
        preference -- base preference for the parsed variants
        m3u8_id -- optional prefix for generated 'format_id' values
        note, errnote -- custom download progress/error messages
        fatal -- when False, a failed download returns False instead of raising

        Returns a sorted list of format dicts, or False when the playlist
        could not be downloaded non-fatally.
        """

        # Pseudo-format pointing at the playlist itself, so the HLS downloader
        # can perform its own quality selection; ranked just below any
        # explicitly parsed variant.
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            # NOTE(review): a preference of 0 is falsy and therefore maps to
            # -1, same as None -- confirm this is intended
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolve variant URIs relative to the playlist URL when they do not
        # already carry an http(s) scheme
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if m3u8_doc is False:
            return m3u8_doc
        # Line-oriented scan: attribute lines (#EXT-X-STREAM-INF, #EXT-X-MEDIA)
        # set state that the next URI line consumes.
        last_info = None
        last_media = None
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    # Strip surrounding quotes from quoted attribute values
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags, comments and blank lines carry no format info
                continue
            else:
                # URI line: combine with the attributes collected above
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Prefer the rendition NAME (unless it is a subtitle track),
                # then the bitrate, then a running index
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
984
985 @staticmethod
986 def _xpath_ns(path, namespace=None):
987 if not namespace:
988 return path
989 out = []
990 for c in path.split('/'):
991 if not c or c == '.':
992 out.append(c)
993 else:
994 out.append('{%s}%s' % (namespace, c))
995 return '/'.join(out)
996
997 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
998 smil = self._download_smil(smil_url, video_id, fatal=fatal)
999
1000 if smil is False:
1001 assert not fatal
1002 return []
1003
1004 namespace = self._parse_smil_namespace(smil)
1005
1006 return self._parse_smil_formats(
1007 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1008
1009 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1010 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1011 if smil is False:
1012 return {}
1013 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1014
1015 def _download_smil(self, smil_url, video_id, fatal=True):
1016 return self._download_xml(
1017 smil_url, video_id, 'Downloading SMIL file',
1018 'Unable to download SMIL file', fatal=fatal)
1019
1020 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1021 namespace = self._parse_smil_namespace(smil)
1022
1023 formats = self._parse_smil_formats(
1024 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1025 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1026
1027 video_id = os.path.splitext(url_basename(smil_url))[0]
1028 title = None
1029 description = None
1030 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1031 name = meta.attrib.get('name')
1032 content = meta.attrib.get('content')
1033 if not name or not content:
1034 continue
1035 if not title and name == 'title':
1036 title = content
1037 elif not description and name in ('description', 'abstract'):
1038 description = content
1039
1040 return {
1041 'id': video_id,
1042 'title': title or video_id,
1043 'description': description,
1044 'formats': formats,
1045 'subtitles': subtitles,
1046 }
1047
1048 def _parse_smil_namespace(self, smil):
1049 return self._search_regex(
1050 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1051
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
        """Build a formats list from the <video> nodes of a SMIL document.

        smil -- parsed SMIL XML document
        smil_url -- URL the document came from, used as base for relative srcs
        video_id -- identifier used in progress/error messages
        namespace -- XML namespace of the document, if any
        f4m_params -- query parameters appended to f4m (HDS) sub-manifest URLs

        RTMP sources become native rtmp formats; m3u8/f4m sources are expanded
        through the respective extractors; other HTTP sources become plain
        downloads.
        """
        # A <meta base=...> (or httpBase) in the head overrides the document
        # URL as the base for relative src attributes
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    # Use the bitrate as format_id suffix when known, else a
                    # running counter
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default Flash HDS client identification parameters.
                    # NOTE(review): this assignment persists across loop
                    # iterations (f4m_params stays set for later videos) --
                    # presumably harmless, but worth confirming.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats
1128
1129 def _parse_smil_subtitles(self, smil, namespace=None):
1130 subtitles = {}
1131 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1132 src = textstream.get('src')
1133 if not src:
1134 continue
1135 ext = textstream.get('ext') or determine_ext(src)
1136 if not ext:
1137 type_ = textstream.get('type')
1138 if type_ == 'text/srt':
1139 ext = 'srt'
1140 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
1141 subtitles.setdefault(lang, []).append({
1142 'url': src,
1143 'ext': ext,
1144 })
1145 return subtitles
1146
1147 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1148 xspf = self._download_xml(
1149 playlist_url, playlist_id, 'Downloading xpsf playlist',
1150 'Unable to download xspf manifest', fatal=fatal)
1151 if xspf is False:
1152 return []
1153 return self._parse_xspf(xspf, playlist_id)
1154
1155 def _parse_xspf(self, playlist, playlist_id):
1156 NS_MAP = {
1157 'xspf': 'http://xspf.org/ns/0/',
1158 's1': 'http://static.streamone.nl/player/ns/0',
1159 }
1160
1161 entries = []
1162 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1163 title = xpath_text(
1164 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1165 description = xpath_text(
1166 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1167 thumbnail = xpath_text(
1168 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1169 duration = float_or_none(
1170 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1171
1172 formats = [{
1173 'url': location.text,
1174 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1175 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1176 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1177 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1178 self._sort_formats(formats)
1179
1180 entries.append({
1181 'id': playlist_id,
1182 'title': title,
1183 'description': description,
1184 'thumbnail': thumbnail,
1185 'duration': duration,
1186 'formats': formats,
1187 })
1188 return entries
1189
1190 def _live_title(self, name):
1191 """ Generate the title for a live video """
1192 now = datetime.datetime.now()
1193 now_str = now.strftime("%Y-%m-%d %H:%M")
1194 return name + ' ' + now_str
1195
1196 def _int(self, v, name, fatal=False, **kwargs):
1197 res = int_or_none(v, **kwargs)
1198 if 'get_attr' in kwargs:
1199 print(getattr(v, kwargs['get_attr']))
1200 if res is None:
1201 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1202 if fatal:
1203 raise ExtractorError(msg)
1204 else:
1205 self._downloader.report_warning(msg)
1206 return res
1207
1208 def _float(self, v, name, fatal=False, **kwargs):
1209 res = float_or_none(v, **kwargs)
1210 if res is None:
1211 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1212 if fatal:
1213 raise ExtractorError(msg)
1214 else:
1215 self._downloader.report_warning(msg)
1216 return res
1217
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Store a cookie in the downloader's cookie jar so it is sent with
        # subsequent requests to `domain`.
        # compat_cookiejar.Cookie takes positional-only arguments; in order:
        # (version, name, value, port, port_specified, domain, domain_specified,
        #  domain_initial_dot, path, path_specified, secure, expires, discard,
        #  comment, comment_url, rest).
        # NOTE(review): several *_specified flags are passed as None rather
        # than booleans, and discard is '' -- verify against cookielib.Cookie.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1223
1224 def _get_cookies(self, url):
1225 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1226 req = compat_urllib_request.Request(url)
1227 self._downloader.cookiejar.add_cookie_header(req)
1228 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1229
1230 def get_testcases(self, include_onlymatching=False):
1231 t = getattr(self, '_TEST', None)
1232 if t:
1233 assert not hasattr(self, '_TESTS'), \
1234 '%s has _TEST and _TESTS' % type(self).__name__
1235 tests = [t]
1236 else:
1237 tests = getattr(self, '_TESTS', [])
1238 for t in tests:
1239 if not include_onlymatching and t.get('only_matching', False):
1240 continue
1241 t['name'] = type(self).__name__[:-len('IE')]
1242 yield t
1243
1244 def is_suitable(self, age_limit):
1245 """ Test whether the extractor is generally suitable for the given
1246 age limit (i.e. pornographic sites are not, all others usually are) """
1247
1248 any_restricted = False
1249 for tc in self.get_testcases(include_onlymatching=False):
1250 if 'playlist' in tc:
1251 tc = tc['playlist'][0]
1252 is_restricted = age_restricted(
1253 tc.get('info_dict', {}).get('age_limit'), age_limit)
1254 if not is_restricted:
1255 return True
1256 any_restricted = any_restricted or is_restricted
1257 return not any_restricted
1258
1259 def extract_subtitles(self, *args, **kwargs):
1260 if (self._downloader.params.get('writesubtitles', False) or
1261 self._downloader.params.get('listsubtitles')):
1262 return self._get_subtitles(*args, **kwargs)
1263 return {}
1264
    def _get_subtitles(self, *args, **kwargs):
        # Hook for subclasses: build the subtitles dict that
        # extract_subtitles() returns when the user requested subtitles.
        raise NotImplementedError("This method must be implemented by subclasses")
1267
1268 def extract_automatic_captions(self, *args, **kwargs):
1269 if (self._downloader.params.get('writeautomaticsub', False) or
1270 self._downloader.params.get('listsubtitles')):
1271 return self._get_automatic_captions(*args, **kwargs)
1272 return {}
1273
    def _get_automatic_captions(self, *args, **kwargs):
        # Hook for subclasses: build the automatic-captions dict that
        # extract_automatic_captions() returns when requested.
        raise NotImplementedError("This method must be implemented by subclasses")
1276
1277
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            # Bare search key: return a single result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY