]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[youtube] Fix extraction of like and dislike count (fixes #3633)
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
3ec05685 4import hashlib
3d3538e4 5import json
4094b6e3 6import netrc
d6983cb4
PH
7import os
8import re
9import socket
10import sys
4094b6e3 11import time
267ed0c5 12import xml.etree.ElementTree
d6983cb4
PH
13
14from ..utils import (
15 compat_http_client,
16 compat_urllib_error,
c7deaa4c 17 compat_urllib_parse_urlparse,
d6983cb4
PH
18 compat_str,
19
20 clean_html,
21 compiled_regex_type,
22 ExtractorError,
31bb8d3f 23 int_or_none,
55b3e45b 24 RegexNotFoundError,
d41e6efc 25 sanitize_filename,
f38de77f 26 unescapeHTML,
d6983cb4 27)
# Sentinel meaning "no default was supplied"; a dedicated object is used so
# that None remains usable as an explicit default value.
_NO_DEFAULT = object()
d6983cb4 29
dca08720 30
d6983cb4
PH
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer  HTTP Referer header value to set.
                    * http_method  HTTP method to use for the download.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * http_post_data  Additional data to send with a POST
                                 request.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Set to True once initialize() has run _real_initialize().
    _ready = False
    # The downloader object used for network access and screen output.
    _downloader = None
    # Subclasses set this to False to mark a broken extractor.
    _WORKING = True
147
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader.

    The instance starts out not initialized; initialize() runs the real
    setup the first time it is called.
    """
    self._ready = False
    self.set_downloader(downloader)
152
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url) is not None
d6983cb4
PH
163
@classmethod
def working(cls):
    """Getter method for _WORKING."""
    return cls._WORKING
168
def initialize(self):
    """Initializes an instance (authentication, etc).

    _real_initialize() is run only on the first call; later calls are no-ops.
    """
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
174
def extract(self, url):
    """Initializes the extractor if needed and returns _real_extract(url)."""
    self.initialize()
    return self._real_extract(url)
179
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    self._downloader = downloader
183
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass
187
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass
191
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # The key is the class name without its last two characters
    # (the "IE" suffix in names such as "YoutubeIE").
    return cls.__name__[:-2]
196
d6983cb4
PH
@property
def IE_NAME(self):
    """The class name of this extractor without its last two characters."""
    return type(self).__name__[:-2]
200
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle

    note    -- progress message to print; None prints the default
               "Downloading webpage" report, False prints nothing.
    errnote -- prefix of the error message; None uses a default text,
               False makes a network error return False silently
               (regardless of fatal).
    fatal   -- on a network error, raise ExtractorError if true; otherwise
               report a warning and return False.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        if fatal:
            # Pass the traceback along so the original failure stays visible.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
        return False
d6983cb4 223
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle)

    Returns False instead when the request failed without raising (see
    _request_webpage for the note/errnote/fatal semantics).
    Raises ExtractorError if the page is a Websense block page.
    """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage also returns False for errnote=False while fatal
        # is true, so this must not be asserted to be the non-fatal case.
        return False

    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()

    # Pick the encoding: Content-Type header first, then a <meta> charset
    # within the first KiB, then a UTF-16 LE byte order mark, else UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    params = self._downloader.params
    dump_pages = params.get('dump_intermediate_pages', False)
    write_pages = params.get('write_pages', False)
    if dump_pages or write_pages:
        # Request objects expose get_full_url(); plain strings are the URL.
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request

    if dump_pages:
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if write_pages:
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep the name within 240 characters by replacing the tail
            # with a hash of the full name.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown encoding name announced by the server or page.
        content = webpage_bytes.decode('utf-8', 'replace')

    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return (content, urlh)
288
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string

    Returns False when _download_webpage_handle() reports a failed download
    by returning False.
    """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        return res
    content, _ = res
    return content
d6983cb4 297
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element

    transform_source, if given, is applied to the downloaded text before
    parsing. Returns False when the download itself returned False.
    """
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
        return xml_string
    if transform_source:
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
309
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download a page and return its content parsed as JSON.

    transform_source, if given, is applied to the downloaded text before
    parsing. Returns None when the download returned False. Raises
    ExtractorError when the text cannot be parsed as JSON.
    """
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if json_string is False:
        # Never hand the False failure marker to json.loads(), whatever the
        # value of fatal.
        return None
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        raise ExtractorError('Failed to download JSON', cause=ve)
325
def report_warning(self, msg, video_id=None):
    """Report a warning through the downloader, prefixed with '[ie_name]'
    and, when given, the video id."""
    idstr = '' if video_id is None else '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 330
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
334
def report_extraction(self, id_or_name):
    """Report information extraction."""
    self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
338
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
342
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')
d6983cb4 346
fc79158d
JMF
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
fc79158d 350
d6983cb4 351 #Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed

    The result is a dict with '_type' set to 'url'; 'id' is only included
    when video_id is not None.
    """
    # TODO: ie should be the class used for getting the info
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        video_info['id'] = video_id
    return video_info
c0d0b01f
JMF
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist

    The result is a dict with '_type' set to 'playlist'; 'id' and 'title'
    are only included when the corresponding argument is truthy.
    """
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
372
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # Start from "no match" so that an empty list of patterns is handled by
    # the failure branches below instead of leaving mobj undefined.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    if default is not _NO_DEFAULT:
        return default

    # Colorize the field name when writing to a terminal (not on Windows).
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s; '
        'please report this issue on http://yt-dl.org/bug' % _name)
    return None
404
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if not res:
        # None, a falsy default or an empty match is returned untouched.
        return res
    return clean_html(res).strip()
414
fc79158d
JMF
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        # A username may be configured without a password entry.
        password = downloader_params.get('password')
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

    return (username, password)
444
def _get_tfa_info(self):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    if self._downloader is None:
        return None
    # dict.get already yields None for a missing (or None) entry.
    return self._downloader.params.get('twofactor', None)
460
46720279
JMF
461 # Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
    """Return the regexes matching the <meta> tag of OpenGraph property prop.

    Two patterns are returned, one for each order of the property and
    content attributes; the content value is captured in a group.
    """
    content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
    property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
    template = r'<meta[^>]+?%s[^>]+?%s'
    return [
        template % (property_re, content_re),
        template % (content_re, property_re),
    ]
46720279 471
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for the OpenGraph property prop.

    Returns the HTML-unescaped content value, or None when _search_regex
    returned None. Extra keyword arguments are passed to _search_regex.
    """
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    if escaped is None:
        return None
    return unescapeHTML(escaped)
46720279
JMF
479
def _og_search_thumbnail(self, html, **kargs):
    """Search html for the OpenGraph image property (non-fatal)."""
    return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
482
def _og_search_description(self, html, **kargs):
    """Search html for the OpenGraph description property (non-fatal)."""
    return self._og_search_property('description', html, fatal=False, **kargs)
485
def _og_search_title(self, html, **kargs):
    """Search html for the OpenGraph title property."""
    return self._og_search_property('title', html, **kargs)
488
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Search html for the OpenGraph video URL.

    The og:video and og:video:url properties are tried in that order; when
    secure is true, og:video:secure_url is tried before them.
    """
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
46720279 494
78338f71
JMF
def _og_search_url(self, html, **kargs):
    """Search html for the OpenGraph url property."""
    return self._og_search_property('url', html, **kargs)
497
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search html for the content of a <meta> tag identified by name.

    The tag is matched through its itemprop, name or property attribute.
    display_name is the field name used in messages and defaults to name.
    Unlike _html_search_regex, this is non-fatal by default.
    """
    if display_name is None:
        display_name = name
    # The pattern is verbose (?x), so the whitespace inside it is ignored.
    return self._html_search_regex(
        r'''(?ix)<meta
                (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=fatal, **kwargs)
59040888
PH
506
def _dc_search_uploader(self, html):
    """Search html for the dc.creator <meta> tag, reported as 'uploader'."""
    return self._html_search_meta('dc.creator', html, 'uploader')
509
8dbe9899
PH
def _rta_search(self, html):
    """Return 18 if html carries the RTA rating <meta> label, else 0."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r'     content="RTA-5042-1996-1400-1577-RTA"',
                 html):
        return 18
    return 0
517
59040888
PH
def _media_rating_search(self, html):
    """Map the 'rating' <meta> tag of html to an age limit.

    Returns None when the tag is missing or its value is not in the table.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    # The lookup is case-insensitive.
    return RATING_TABLE.get(rating.lower(), None)
533
0c708f11
JMF
def _twitter_search_player(self, html):
    """Search html for the twitter:player <meta> tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
537
def _sort_formats(self, formats):
    """Sort the list of format dicts in place, from worst to best.

    Formats are ordered by preference, quality, height, width, container
    preference, bitrates, audio container preference, file size and finally
    format_id. Missing numeric fields count as -1.
    Raises ExtractorError if formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    prefer_free = self._downloader.params.get('prefer_free_formats')

    def _known(value, fallback=-1):
        # None is not orderable against numbers/strings on Python 3.
        return value if value is not None else fallback

    def _ext_rank(order, ext):
        # Position in the ORDER list (later is better), -1 if not listed.
        try:
            return order.index(ext)
        except ValueError:
            return -1

    def _formats_key(f):
        # TODO remove the following workaround
        if not f.get('ext') and 'url' in f:
            from ..utils import determine_ext
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if prefer_free:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _ext_rank(ORDER, f.get('ext'))
        else:
            if prefer_free:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _ext_rank(ORDER, f.get('ext'))
            audio_ext_preference = 0

        return (
            preference,
            _known(f.get('quality')),
            _known(f.get('height')),
            _known(f.get('width')),
            ext_preference,
            _known(f.get('tbr')),
            _known(f.get('vbr')),
            _known(f.get('abr')),
            audio_ext_preference,
            _known(f.get('filesize')),
            _known(f.get('filesize_approx')),
            # A missing format_id must still compare against strings.
            _known(f.get('format_id'), ''),
        )
    formats.sort(key=_formats_key)
59040888 594
20991253
PH
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
601
57c7411f
PH
def _proto_relative_url(self, url, scheme=None):
    """Prepend a scheme to a protocol-relative URL ("//host/path").

    scheme defaults to http_scheme(). None and URLs that do not start with
    "//" are returned unchanged.
    """
    if url is None:
        return url
    if not url.startswith('//'):
        return url
    if scheme is None:
        scheme = self.http_scheme()
    return scheme + url
611
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """Print a waiting message, then sleep for timeout seconds.

    msg_template is a %-format string that may use the video_id and timeout
    keys.
    """
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
618
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an f4m manifest and return its media entries as formats.

    One format is built per <media> element of the manifest; every format
    uses manifest_url as its URL. The returned list is sorted with
    _sort_formats().
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Fall back to the element index when no bitrate is given.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        formats.append({
            'format_id': format_id,
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
        })
    self._sort_formats(formats)

    return formats
640
704df56d
PH
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
    """Download an m3u8 playlist and return the formats it lists.

    The first entry always points at the playlist itself ("m3u8-meta").
    Each URI line adds one format, using the attributes of the preceding
    #EXT-X-STREAM-INF line when there is one. The returned list is sorted
    with _sort_formats().
    """
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    m3u8_doc = self._download_webpage(m3u8_url, video_id)
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            continue
        else:
            if last_info is None:
                formats.append({'url': line.strip()})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': line.strip(),
                'tbr': tbr,
                'ext': ext,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # CODECS may list one codec or more than two, so do not
                # unpack it into exactly two names.
                # NOTE(review): the first entry is taken as the video codec
                # and the second as the audio codec, as before - confirm
                # the order is reliable.
                codec_list = codecs.split(',')
                if codec_list[0]:
                    f['vcodec'] = codec_list[0].partition('.')[0]
                if len(codec_list) > 1 and codec_list[1]:
                    f['acodec'] = codec_list[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
692
8dbe9899 693
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested results.

        An empty prefix requests one result, "all" requests _MAX_RESULTS,
        and a number requests that many (capped at _MAX_RESULTS with a
        warning). Raises ExtractorError for a query that does not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The search key prefix (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY