]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Merge remote-tracking branch 'lenaten/8tracks'
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
d6983cb4
PH
17 compat_http_client,
18 compat_urllib_error,
c7deaa4c 19 compat_urllib_parse_urlparse,
f0b5d6af 20 compat_urlparse,
d6983cb4 21 compat_str,
8c25f81b
PH
22)
23from ..utils import (
05900629 24 age_restricted,
d6983cb4
PH
25 clean_html,
26 compiled_regex_type,
27 ExtractorError,
b14f3a4c 28 float_or_none,
31bb8d3f 29 int_or_none,
55b3e45b 30 RegexNotFoundError,
d41e6efc 31 sanitize_filename,
f38de77f 32 unescapeHTML,
d6983cb4 33)
# Unique sentinel used as the default of "default" parameters, so that an
# explicitly passed default of None can be told apart from no default at all
_NO_DEFAULT = object()
d6983cb4 35
dca08720 36
d6983cb4
PH
37class InfoExtractor(object):
38 """Information Extractor class.
39
40 Information extractors are the classes that, given a URL, extract
41 information about the video (or videos) the URL refers to. This
42 information includes the real video URL, the video title, author and
43 others. The information is stored in a dictionary which is then
5d380852 44 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
45 information possibly downloading the video to the file system, among
46 other possible outcomes.
47
fed5d032
PH
48 The type field determines the type of the result.
49 By far the most common value (and the default if _type is missing) is
50 "video", which indicates a single video.
51
52 For a video, the dictionaries must include the following fields:
d6983cb4
PH
53
54 id: Video identifier.
d6983cb4 55 title: Video title, unescaped.
d67b0b15 56
f49d89ee 57 Additionally, it must contain either a formats entry or a url one:
d67b0b15 58
f49d89ee
PH
59 formats: A list of dictionaries for each format available, ordered
60 from worst to best quality.
61
62 Potential fields:
d67b0b15
PH
63 * url Mandatory. The URL of the video file
64 * ext Will be calculated from url if missing
65 * format A human-readable description of the format
66 ("mp4 container with h264/opus").
67 Calculated from the format_id, width, height.
68 and format_note fields if missing.
69 * format_id A short description of the format
5d4f3985
PH
70 ("mp4_h264_opus" or "19").
71 Technically optional, but strongly recommended.
d67b0b15
PH
72 * format_note Additional info about the format
73 ("3D" or "DASH video")
74 * width Width of the video, if known
75 * height Height of the video, if known
f49d89ee 76 * resolution Textual description of width and height
7217e148 77 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
78 * abr Average audio bitrate in KBit/s
79 * acodec Name of the audio codec in use
dd27fd17 80 * asr Audio sampling rate in Hertz
d67b0b15 81 * vbr Average video bitrate in KBit/s
fbb21cf5 82 * fps Frame rate
d67b0b15 83 * vcodec Name of the video codec in use
1394ce65 84 * container Name of the container format
d67b0b15 85 * filesize The number of bytes, if known in advance
9732d77e 86 * filesize_approx An estimate for the number of bytes
d67b0b15 87 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
88 * protocol The protocol that will be used for the actual
89 download, lower-case.
db1f3888 90 "http", "https", "rtsp", "rtmp", "m3u8" or so.
f49d89ee 91 * preference Order number of this format. If this field is
08d13955 92 present and not None, the formats get sorted
38d63d84 93 by this field, regardless of all other values.
f49d89ee
PH
94 -1 for default (order by other properties),
95 -2 or smaller for less than default.
e65566a9
PH
96 < -1000 to hide the format (if there is
97 another one which is strictly better)
aff2f4f4
PH
98 * language_preference Is this in the correct requested
99 language?
100 10 if it's what the URL is about,
101 -1 for default (don't know),
102 -10 otherwise, other values reserved for now.
5d73273f
PH
103 * quality Order number of the video quality of this
104 format, irrespective of the file format.
105 -1 for default (order by other properties),
106 -2 or smaller for less than default.
c64ed2a3
PH
107 * source_preference Order number for this video source
108 (quality takes higher priority)
109 -1 for default (order by other properties),
110 -2 or smaller for less than default.
d769be6c
PH
111 * http_referer HTTP Referer header value to set.
112 * http_method HTTP method to use for the download.
113 * http_headers A dictionary of additional HTTP headers
114 to add to the request.
115 * http_post_data Additional data to send with a POST
116 request.
6271f1ca
PH
117 * stretched_ratio If given and not 1, indicates that the
118 video's pixels are not square.
119 width : height ratio as float.
c0ba0f48 120 url: Final video URL.
d6983cb4 121 ext: Video filename extension.
d67b0b15
PH
122 format: The video format, defaults to ext (used for --get-format)
123 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 124
d6983cb4
PH
125 The following fields are optional:
126
f5e43bc6 127 alt_title: A secondary title of the video.
0afef30b
PH
128 display_id An alternative identifier for the video, not necessarily
129 unique, but available before title. Typically, id is
130 something like "4234987", title "Dancing naked mole rats",
131 and display_id "dancing-naked-mole-rats"
d5519808
PH
132 thumbnails: A list of dictionaries, with the following entries:
133 * "url"
134 * "width" (optional, int)
135 * "height" (optional, int)
136 * "resolution" (optional, string "{width}x{height}",
137 deprecated)
d6983cb4 138 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 139 description: Full video description.
d6983cb4 140 uploader: Full name of the video uploader.
955c4514 141 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 142 upload_date: Video upload date (YYYYMMDD).
955c4514 143 If not explicitly set, calculated from timestamp.
d6983cb4 144 uploader_id: Nickname or id of the video uploader.
da9ec3b9 145 location: Physical location where the video was filmed.
5d51a883
JMF
146 subtitles: The subtitle file contents as a dictionary in the format
147 {language: subtitles}.
c0ba0f48 148 duration: Length of the video in seconds, as an integer.
f3d29461 149 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
150 like_count: Number of positive ratings of the video
151 dislike_count: Number of negative ratings of the video
152 comment_count: Number of comments on the video
dd622d7c
PH
153 comments: A list of comments, each with one or more of the following
154 properties (all but one of text or html optional):
155 * "author" - human-readable name of the comment author
156 * "author_id" - user ID of the comment author
157 * "id" - Comment ID
158 * "html" - Comment as HTML
159 * "text" - Plain text of the comment
160 * "timestamp" - UNIX timestamp of comment
161 * "parent" - ID of the comment this one is replying to.
162 Set to "root" to indicate that this is a
163 comment to the original video.
8dbe9899 164 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
165 webpage_url: The url to the video webpage, if given to youtube-dl it
166 should allow to get the same result again. (It will be set
167 by YoutubeDL if it's missing)
ad3bc6ac
PH
168 categories: A list of categories that the video falls in, for example
169 ["Sports", "Berlin"]
7267bd53
PH
170 is_live: True, False, or None (=unknown). Whether this video is a
171 live stream that goes on instead of a fixed-length video.
d6983cb4 172
deefc05b 173 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 174
d838b1bd
PH
175 Unless mentioned otherwise, None is equivalent to absence of information.
176
fed5d032
PH
177
178 _type "playlist" indicates multiple videos.
b82f815f
PH
179 There must be a key "entries", which is a list, an iterable, or a PagedList
180 object, each element of which is a valid dictionary by this specification.
fed5d032
PH
181
182 Additionally, playlists can have "title" and "id" attributes with the same
183 semantics as videos (see above).
184
185
186 _type "multi_video" indicates that there are multiple videos that
187 form a single show, for example multiple acts of an opera or TV episode.
188 It must have an entries key like a playlist and contain all the keys
189 required for a video at the same time.
190
191
192 _type "url" indicates that the video must be extracted from another
193 location, possibly by a different extractor. Its only required key is:
194 "url" - the next URL to extract.
f58766ce
PH
195 The key "ie_key" can be set to the class name (minus the trailing "IE",
196 e.g. "Youtube") if the extractor class is known in advance.
197 Additionally, the dictionary may have any properties of the resolved entity
198 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
199 known ahead of time.
200
201
202 _type "url_transparent" entities have the same specification as "url", but
203 indicate that the given additional information is more precise than the one
204 associated with the resolved URL.
205 This is useful when a site employs a video service that hosts the video and
206 its technical metadata, but that video service does not embed a useful
207 title, description etc.
208
209
d6983cb4
PH
210 Subclasses of this one should re-define the _real_initialize() and
211 _real_extract() methods and define a _VALID_URL regexp.
212 Probably, they should also be added to the list of extractors.
213
d6983cb4
PH
214 Finally, the _WORKING attribute should be set to False for broken IEs
215 in order to warn the users and skip the tests.
216 """
217
218 _ready = False
219 _downloader = None
220 _WORKING = True
221
222 def __init__(self, downloader=None):
223 """Constructor. Receives an optional downloader."""
224 self._ready = False
225 self.set_downloader(downloader)
226
@classmethod
def suitable(cls, url):
    """Return True if the URL matches this extractor's _VALID_URL."""
    # cls.__dict__ is inspected directly instead of using hasattr()/getattr():
    # the compiled regexp has to be cached for *this* class, and a normal
    # attribute lookup would also find one cached on a superclass.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return valid_url_re.match(url) is not None
d6983cb4 237
ed9266db
PH
238 @classmethod
239 def _match_id(cls, url):
240 if '_VALID_URL_RE' not in cls.__dict__:
241 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
242 m = cls._VALID_URL_RE.match(url)
243 assert m
244 return m.group('id')
245
d6983cb4
PH
246 @classmethod
247 def working(cls):
248 """Getter method for _WORKING."""
249 return cls._WORKING
250
def initialize(self):
    """Run _real_initialize() (authentication, etc) once per instance."""
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
256
def extract(self, url):
    """Initialize the extractor if needed, then return _real_extract(url)."""
    self.initialize()
    result = self._real_extract(url)
    return result
261
def set_downloader(self, downloader):
    """Attach the downloader object this IE reports to and downloads with."""
    self._downloader = downloader
265
266 def _real_initialize(self):
267 """Real initialization process. Redefine in subclasses."""
268 pass
269
270 def _real_extract(self, url):
271 """Real extraction process. Redefine in subclasses."""
272 pass
273
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the class name without its last two characters.

    This is the string used to look up the InfoExtractor with
    get_info_extractor (e.g. "Youtube" for a class named YoutubeIE).
    """
    class_name = cls.__name__
    return class_name[:-2]
278
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the extractor: the class name minus its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
282
7cc3570e 283 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
284 """ Returns the response handle """
285 if note is None:
286 self.report_download_webpage(video_id)
287 elif note is not False:
7cc3570e 288 if video_id is None:
f1a9d64e 289 self.to_screen('%s' % (note,))
7cc3570e 290 else:
f1a9d64e 291 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 292 try:
dca08720 293 return self._downloader.urlopen(url_or_request)
d6983cb4 294 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
295 if errnote is False:
296 return False
d6983cb4 297 if errnote is None:
f1a9d64e
PH
298 errnote = 'Unable to download webpage'
299 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
300 if fatal:
301 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
302 else:
303 self._downloader.report_warning(errmsg)
304 return False
d6983cb4 305
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Return a tuple (page content as string, URL handle).

    False is returned instead when the request failed non-fatally.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False for non-fatal failures
        assert not fatal
        return False
    return (
        self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal),
        urlh,
    )
318
4e262a88 319 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
d6983cb4 320 content_type = urlh.headers.get('Content-Type', '')
f143d86a 321 webpage_bytes = urlh.read()
4e262a88
PH
322 if prefix is not None:
323 webpage_bytes = prefix + webpage_bytes
d6983cb4
PH
324 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
325 if m:
326 encoding = m.group(1)
327 else:
0d75ae2c 328 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
329 webpage_bytes[:1024])
330 if m:
331 encoding = m.group(1).decode('ascii')
b60016e8
PH
332 elif webpage_bytes.startswith(b'\xff\xfe'):
333 encoding = 'utf-16'
f143d86a
PH
334 else:
335 encoding = 'utf-8'
d6983cb4
PH
336 if self._downloader.params.get('dump_intermediate_pages', False):
337 try:
338 url = url_or_request.get_full_url()
339 except AttributeError:
340 url = url_or_request
f1a9d64e 341 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
342 dump = base64.b64encode(webpage_bytes).decode('ascii')
343 self._downloader.to_screen(dump)
d41e6efc
PH
344 if self._downloader.params.get('write_pages', False):
345 try:
346 url = url_or_request.get_full_url()
347 except AttributeError:
348 url = url_or_request
5afa7f8b 349 basen = '%s_%s' % (video_id, url)
c1bce22f 350 if len(basen) > 240:
f1a9d64e 351 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
352 basen = basen[:240 - len(h)] + h
353 raw_filename = basen + '.dump'
d41e6efc 354 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 355 self.to_screen('Saving request to ' + filename)
5f58165d
S
356 # Working around MAX_PATH limitation on Windows (see
357 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
358 if os.name == 'nt':
359 absfilepath = os.path.abspath(filename)
360 if len(absfilepath) > 259:
361 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
362 with open(filename, 'wb') as outf:
363 outf.write(webpage_bytes)
364
ec0fafbb
AA
365 try:
366 content = webpage_bytes.decode(encoding, 'replace')
367 except LookupError:
368 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 369
f1a9d64e
PH
370 if ('<title>Access to this site is blocked</title>' in content and
371 'Websense' in content[:512]):
372 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
373 blocked_iframe = self._html_search_regex(
374 r'<iframe src="([^"]+)"', content,
f1a9d64e 375 'Websense information URL', default=None)
2410c43d 376 if blocked_iframe:
f1a9d64e 377 msg += ' Visit %s for more details' % blocked_iframe
2410c43d
PH
378 raise ExtractorError(msg, expected=True)
379
23be51d8 380 return content
d6983cb4 381
995ad69c 382 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
d6983cb4 383 """ Returns the data of the page as a string """
995ad69c
TF
384 success = False
385 try_count = 0
386 while success is False:
387 try:
388 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
389 success = True
390 except compat_http_client.IncompleteRead as e:
391 try_count += 1
392 if try_count >= tries:
393 raise e
394 self._sleep(timeout, video_id)
7cc3570e
PH
395 if res is False:
396 return res
397 else:
398 content, _ = res
399 return content
d6983cb4 400
2a275ab0 401 def _download_xml(self, url_or_request, video_id,
f1a9d64e 402 note='Downloading XML', errnote='Unable to download XML',
28746fbd 403 transform_source=None, fatal=True):
267ed0c5 404 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
405 xml_string = self._download_webpage(
406 url_or_request, video_id, note, errnote, fatal=fatal)
407 if xml_string is False:
408 return xml_string
e2b38da9
PH
409 if transform_source:
410 xml_string = transform_source(xml_string)
267ed0c5
JMF
411 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
412
3d3538e4 413 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
414 note='Downloading JSON metadata',
415 errnote='Unable to download JSON metadata',
b090af59
PH
416 transform_source=None,
417 fatal=True):
418 json_string = self._download_webpage(
419 url_or_request, video_id, note, errnote, fatal=fatal)
420 if (not fatal) and json_string is False:
421 return None
ebb64199
TF
422 return self._parse_json(
423 json_string, video_id, transform_source=transform_source, fatal=fatal)
424
425 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
426 if transform_source:
427 json_string = transform_source(json_string)
3d3538e4
PH
428 try:
429 return json.loads(json_string)
430 except ValueError as ve:
e7b6d122
PH
431 errmsg = '%s: Failed to parse JSON ' % video_id
432 if fatal:
433 raise ExtractorError(errmsg, cause=ve)
434 else:
435 self.report_warning(errmsg + str(ve))
3d3538e4 436
def report_warning(self, msg, video_id=None):
    """Report a warning through the downloader, prefixed with '[ie_name]'
    and, if video_id is given, with the video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    full_message = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(full_message)
f45f96f8 441
d6983cb4
PH
def to_screen(self, msg):
    """Print msg via the downloader, prefixing it with '[ie_name]'"""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
445
def report_extraction(self, id_or_name):
    """Print the "Extracting information" message for the given id or name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
449
def report_download_webpage(self, video_id):
    """Print the "Downloading webpage" message for the given video id."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
453
def report_age_confirmation(self):
    """Print the message announcing an attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 457
fc79158d
JMF
def report_login(self):
    """Print the message announcing an attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 461
5f6a1245 462 # Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a "url" result: a reference to a page that should be processed.

    ie is stored as the 'ie_key' of the result; 'id' is only set when
    video_id is not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        result['id'] = video_id
    return result
5f6a1245 473
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a "playlist" result around entries.

    'id', 'title' and 'description' are only set for truthy arguments.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
486
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    If group is given, that group of the match is returned instead of the
    first group that took part in the match.  An empty list of patterns
    counts as a failure.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from None so that an empty list of patterns is handled as
        # "not found" instead of leaving mobj unbound
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in messages when writing to a terminal
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
521
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html and
    stripped.  Falsy results (no match, empty string) are returned as is.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        return res
    return clean_html(res).strip()
531
fc79158d
JMF
532 def _get_login_info(self):
533 """
534 Get the the login info as (username, password)
535 It will look in the netrc file using the _NETRC_MACHINE value
536 If there's no info available, return (None, None)
537 """
538 if self._downloader is None:
539 return (None, None)
540
541 username = None
542 password = None
543 downloader_params = self._downloader.params
544
545 # Attempt to use provided username and password or .netrc data
546 if downloader_params.get('username', None) is not None:
547 username = downloader_params['username']
548 password = downloader_params['password']
549 elif downloader_params.get('usenetrc', False):
550 try:
551 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
552 if info is not None:
553 username = info[0]
554 password = info[2]
555 else:
556 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
557 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 558 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 559
fc79158d
JMF
560 return (username, password)
561
83317f69 562 def _get_tfa_info(self):
563 """
564 Get the two-factor authentication info
565 TODO - asking the user will be required for sms/phone verify
566 currently just uses the command line option
567 If there's no info available, return None
568 """
569 if self._downloader is None:
83317f69 570 return None
571 downloader_params = self._downloader.params
572
573 if downloader_params.get('twofactor', None) is not None:
574 return downloader_params['twofactor']
575
83317f69 576 return None
577
46720279
JMF
578 # Helper functions for extracting OpenGraph info
579 @staticmethod
ab2d5247 580 def _og_regexes(prop):
c1206423 581 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 582 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 583 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 584 return [
78fb87b2
JMF
585 template % (property_re, content_re),
586 template % (content_re, property_re),
ab2d5247 587 ]
46720279 588
3c4e6d83 589 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 590 if name is None:
3c4e6d83 591 name = 'OpenGraph %s' % prop
ab2d5247 592 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
593 if escaped is None:
594 return None
595 return unescapeHTML(escaped)
46720279
JMF
596
597 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 598 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
599
600 def _og_search_description(self, html, **kargs):
601 return self._og_search_property('description', html, fatal=False, **kargs)
602
603 def _og_search_title(self, html, **kargs):
604 return self._og_search_property('title', html, **kargs)
605
8ffa13e0 606 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
607 regexes = self._og_regexes('video') + self._og_regexes('video:url')
608 if secure:
609 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 610 return self._html_search_regex(regexes, html, name, **kargs)
46720279 611
78338f71
JMF
612 def _og_search_url(self, html, **kargs):
613 return self._og_search_property('url', html, **kargs)
614
40c696e5 615 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
616 if display_name is None:
617 display_name = name
618 return self._html_search_regex(
6c6f1408 619 r'''(?isx)<meta
711ede6e 620 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
bec22481 621 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
711ede6e 622 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
623
624 def _dc_search_uploader(self, html):
625 return self._html_search_meta('dc.creator', html, 'uploader')
626
8dbe9899
PH
627 def _rta_search(self, html):
628 # See http://www.rtalabel.org/index.php?content=howtofaq#single
629 if re.search(r'(?ix)<meta\s+name="rating"\s+'
630 r' content="RTA-5042-1996-1400-1577-RTA"',
631 html):
632 return 18
633 return 0
634
59040888
PH
635 def _media_rating_search(self, html):
636 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
637 rating = self._html_search_meta('rating', html)
638
639 if not rating:
640 return None
641
642 RATING_TABLE = {
643 'safe for kids': 0,
644 'general': 8,
645 '14 years': 14,
646 'mature': 17,
647 'restricted': 19,
648 }
649 return RATING_TABLE.get(rating.lower(), None)
650
0c708f11
JMF
651 def _twitter_search_player(self, html):
652 return self._html_search_meta('twitter:player', html,
9e1a5b84 653 'twitter card player')
0c708f11 654
4bcc7bd1 655 def _sort_formats(self, formats):
7e8caf30 656 if not formats:
f1a9d64e 657 raise ExtractorError('No video formats found')
7e8caf30 658
4bcc7bd1 659 def _formats_key(f):
e6812ac9
PH
660 # TODO remove the following workaround
661 from ..utils import determine_ext
662 if not f.get('ext') and 'url' in f:
663 f['ext'] = determine_ext(f['url'])
664
4bcc7bd1
PH
665 preference = f.get('preference')
666 if preference is None:
c7deaa4c
PH
667 proto = f.get('protocol')
668 if proto is None:
669 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
670
671 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
672 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
673 preference -= 0.5
674
675 if f.get('vcodec') == 'none': # audio only
676 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 677 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 678 else:
f1a9d64e 679 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
680 ext_preference = 0
681 try:
682 audio_ext_preference = ORDER.index(f['ext'])
683 except ValueError:
684 audio_ext_preference = -1
685 else:
686 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 687 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 688 else:
f1a9d64e 689 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
690 try:
691 ext_preference = ORDER.index(f['ext'])
692 except ValueError:
693 ext_preference = -1
694 audio_ext_preference = 0
695
696 return (
697 preference,
aff2f4f4 698 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 699 f.get('quality') if f.get('quality') is not None else -1,
4bcc7bd1
PH
700 f.get('height') if f.get('height') is not None else -1,
701 f.get('width') if f.get('width') is not None else -1,
702 ext_preference,
9933b574 703 f.get('tbr') if f.get('tbr') is not None else -1,
4bcc7bd1
PH
704 f.get('vbr') if f.get('vbr') is not None else -1,
705 f.get('abr') if f.get('abr') is not None else -1,
706 audio_ext_preference,
2c8e03d9 707 f.get('fps') if f.get('fps') is not None else -1,
4bcc7bd1 708 f.get('filesize') if f.get('filesize') is not None else -1,
9732d77e 709 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 710 f.get('source_preference') if f.get('source_preference') is not None else -1,
4bcc7bd1
PH
711 f.get('format_id'),
712 )
713 formats.sort(key=_formats_key)
59040888 714
def http_scheme(self):
    """ Either "http:" or "https:", depending on the 'prefer_insecure' param """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
721
57c7411f
PH
722 def _proto_relative_url(self, url, scheme=None):
723 if url is None:
724 return url
725 if url.startswith('//'):
726 if scheme is None:
727 scheme = self.http_scheme()
728 return scheme + url
729 else:
730 return url
731
4094b6e3
PH
732 def _sleep(self, timeout, video_id, msg_template=None):
733 if msg_template is None:
f1a9d64e 734 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
735 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
736 self.to_screen(msg)
737 time.sleep(timeout)
738
31bb8d3f 739 def _extract_f4m_formats(self, manifest_url, video_id):
f036a632
JMF
740 manifest = self._download_xml(
741 manifest_url, video_id, 'Downloading f4m manifest',
742 'Unable to download f4m manifest')
31bb8d3f
JMF
743
744 formats = []
b2527359
PH
745 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
746 for i, media_el in enumerate(media_nodes):
747 tbr = int_or_none(media_el.attrib.get('bitrate'))
748 format_id = 'f4m-%d' % (i if tbr is None else tbr)
31bb8d3f 749 formats.append({
b2527359 750 'format_id': format_id,
31bb8d3f
JMF
751 'url': manifest_url,
752 'ext': 'flv',
b2527359 753 'tbr': tbr,
31bb8d3f
JMF
754 'width': int_or_none(media_el.attrib.get('width')),
755 'height': int_or_none(media_el.attrib.get('height')),
756 })
757 self._sort_formats(formats)
758
759 return formats
760
f0b5d6af
PH
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Extract formats from an m3u8 playlist.

    The result always starts with an 'm3u8-meta' entry pointing at the
    playlist itself, followed by one entry per URI line of the playlist.

    m3u8_url       -- playlist URL; non-http(s) URIs are joined onto it
    video_id       -- forwarded to self._download_webpage()
    ext            -- stored as 'ext' on the meta entry and on entries
                      built from an #EXT-X-STREAM-INF line
    entry_protocol -- stored as 'protocol' on #EXT-X-STREAM-INF entries
    preference     -- stored as 'preference' on #EXT-X-STREAM-INF entries

    Returns the list after it has been passed to self._sort_formats().
    """
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # http(s) URIs are used verbatim, everything else is resolved
        # relative to the playlist URL.
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')

    # Attributes of the most recent #EXT-X-STREAM-INF line; None until the
    # first such line has been seen.
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    # Drop the surrounding quotes of a quoted value
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # Other tags, comments and blank lines carry no format
            continue
        else:
            # Strip once so that both branches below see the same URI;
            # stray whitespace would otherwise end up in the URL and make
            # an absolute URI fail the http(s) check in format_url().
            uri = format_url(line.strip())
            if last_info is None:
                formats.append({'url': uri})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': uri,
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: the video codec does not necessarily come first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            # The attributes apply to a single URI line only
            last_info = {}
    self._sort_formats(formats)
    return formats
827
e89a2aab
S
828 # TODO: improve extraction
829 def _extract_smil_formats(self, smil_url, video_id):
830 smil = self._download_xml(
831 smil_url, video_id, 'Downloading SMIL file',
832 'Unable to download SMIL file')
833
834 base = smil.find('./head/meta').get('base')
835
836 formats = []
837 rtmp_count = 0
838 for video in smil.findall('./body/switch/video'):
839 src = video.get('src')
840 if not src:
841 continue
842 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
843 width = int_or_none(video.get('width'))
844 height = int_or_none(video.get('height'))
845 proto = video.get('proto')
846 if not proto:
847 if base:
848 if base.startswith('rtmp'):
849 proto = 'rtmp'
850 elif base.startswith('http'):
851 proto = 'http'
852 ext = video.get('ext')
853 if proto == 'm3u8':
854 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
855 elif proto == 'rtmp':
856 rtmp_count += 1
857 streamer = video.get('streamer') or base
858 formats.append({
859 'url': streamer,
860 'play_path': src,
861 'ext': 'flv',
862 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
863 'tbr': bitrate,
864 'width': width,
865 'height': height,
866 })
867 self._sort_formats(formats)
868
869 return formats
870
f4b1c7ad
PH
871 def _live_title(self, name):
872 """ Generate the title for a live video """
873 now = datetime.datetime.now()
874 now_str = now.strftime("%Y-%m-%d %H:%M")
875 return name + ' ' + now_str
876
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none() and report a failed conversion.

    v      -- value to convert
    name   -- description of the value, used in the failure message
    fatal  -- raise ExtractorError on failure instead of emitting a warning
    kwargs -- forwarded unchanged to int_or_none()

    Returns the result of int_or_none(), which is None on failure when
    fatal is false.
    """
    # A stray debugging print of getattr(v, kwargs['get_attr']) used to live
    # here; it wrote to stdout and raised AttributeError whenever v lacked
    # that attribute, so it has been dropped.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
888
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none() and report a failed conversion.

    kwargs are forwarded to float_or_none().  When the conversion yields
    None, an ExtractorError is raised if fatal is set; otherwise a warning
    is reported through the downloader and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    message = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(message)
    self._downloader.report_warning(message)
    return parsed
898
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for domain (path '/') in the downloader's cookie jar.

    expire_time is passed through as the cookie's 'expires' value.
    """
    new_cookie = compat_cookiejar.Cookie(
        version=0,
        name=name,
        value=value,
        port=None,
        port_specified=None,
        domain=domain,
        domain_specified=None,
        domain_initial_dot=None,
        path='/',
        path_specified=True,
        secure=False,
        expires=expire_time,
        discard='',
        comment=None,
        comment_url=None,
        rest=None)
    self._downloader.cookiejar.set_cookie(new_cookie)
904
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared via _TEST or _TESTS.

    Cases flagged 'only_matching' are left out unless include_onlymatching
    is set.  Each yielded dict gets a 'name' key holding the class name
    with its last two characters (the 'IE' suffix) removed.
    """
    ie_name = type(self).__name__
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % ie_name
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])

    for case in candidates:
        skip = (not include_onlymatching) and case.get('only_matching', False)
        if skip:
            continue
        case['name'] = ie_name[:-len('IE')]
        yield case
918
def is_suitable(self, age_limit):
    """ Tell whether the extractor is generally suitable for the given
    age limit, judging by the age_limit values of its test cases.

    True is returned as soon as one test case is not age restricted, and
    also when there are no test cases at all. """

    saw_restricted = False
    for case in self.get_testcases(include_onlymatching=False):
        # For playlist tests the first playlist entry is the one inspected
        target = case['playlist'][0] if 'playlist' in case else case
        declared_limit = target.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_restricted = True
    return not saw_restricted
933
8dbe9899 934
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that run paged search queries.
    Accepted URLs have the form _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching this extractor's search URLs"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search URL of this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS,
        and a number requests that many, capped at _MAX_RESULTS with a
        warning.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        count_spec = parsed.group('prefix')
        query = parsed.group('query')
        if count_spec == '':
            return self._get_n_results(query, 1)
        if count_spec == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(count_spec)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY