]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[adultswim] PEP8
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
d6983cb4
PH
17 compat_http_client,
18 compat_urllib_error,
c7deaa4c 19 compat_urllib_parse_urlparse,
f0b5d6af 20 compat_urlparse,
d6983cb4 21 compat_str,
8c25f81b
PH
22)
23from ..utils import (
d6983cb4
PH
24 clean_html,
25 compiled_regex_type,
26 ExtractorError,
b14f3a4c 27 float_or_none,
31bb8d3f 28 int_or_none,
55b3e45b 29 RegexNotFoundError,
d41e6efc 30 sanitize_filename,
f38de77f 31 unescapeHTML,
d6983cb4 32)
# Sentinel used as the default of the ``default`` parameter of
# _search_regex/_html_search_regex, so that an explicit ``default=None``
# can be told apart from "no default given".
_NO_DEFAULT = object()
d6983cb4 34
dca08720 35
d6983cb4
PH
36class InfoExtractor(object):
37 """Information Extractor class.
38
39 Information extractors are the classes that, given a URL, extract
40 information about the video (or videos) the URL refers to. This
41 information includes the real video URL, the video title, author and
42 others. The information is stored in a dictionary which is then
43 passed to the FileDownloader. The FileDownloader processes this
44 information possibly downloading the video to the file system, among
45 other possible outcomes.
46
fed5d032
PH
47 The type field determines the type of the result.
48 By far the most common value (and the default if _type is missing) is
49 "video", which indicates a single video.
50
51 For a video, the dictionaries must include the following fields:
d6983cb4
PH
52
53 id: Video identifier.
d6983cb4 54 title: Video title, unescaped.
d67b0b15 55
f49d89ee 56 Additionally, it must contain either a formats entry or a url one:
d67b0b15 57
f49d89ee
PH
58 formats: A list of dictionaries for each format available, ordered
59 from worst to best quality.
60
61 Potential fields:
d67b0b15
PH
62 * url Mandatory. The URL of the video file
63 * ext Will be calculated from url if missing
64 * format A human-readable description of the format
65 ("mp4 container with h264/opus").
66 Calculated from the format_id, width, height.
67 and format_note fields if missing.
68 * format_id A short description of the format
5d4f3985
PH
69 ("mp4_h264_opus" or "19").
70 Technically optional, but strongly recommended.
d67b0b15
PH
71 * format_note Additional info about the format
72 ("3D" or "DASH video")
73 * width Width of the video, if known
74 * height Height of the video, if known
f49d89ee 75 * resolution Textual description of width and height
7217e148 76 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
77 * abr Average audio bitrate in KBit/s
78 * acodec Name of the audio codec in use
dd27fd17 79 * asr Audio sampling rate in Hertz
d67b0b15 80 * vbr Average video bitrate in KBit/s
fbb21cf5 81 * fps Frame rate
d67b0b15 82 * vcodec Name of the video codec in use
1394ce65 83 * container Name of the container format
d67b0b15 84 * filesize The number of bytes, if known in advance
9732d77e 85 * filesize_approx An estimate for the number of bytes
d67b0b15 86 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
87 * protocol The protocol that will be used for the actual
88 download, lower-case.
db1f3888 89 "http", "https", "rtsp", "rtmp", "m3u8" or so.
f49d89ee 90 * preference Order number of this format. If this field is
08d13955 91 present and not None, the formats get sorted
38d63d84 92 by this field, regardless of all other values.
f49d89ee
PH
93 -1 for default (order by other properties),
94 -2 or smaller for less than default.
aff2f4f4
PH
95 * language_preference Is this in the correct requested
96 language?
97 10 if it's what the URL is about,
98 -1 for default (don't know),
99 -10 otherwise, other values reserved for now.
5d73273f
PH
100 * quality Order number of the video quality of this
101 format, irrespective of the file format.
102 -1 for default (order by other properties),
103 -2 or smaller for less than default.
c64ed2a3
PH
104 * source_preference Order number for this video source
105 (quality takes higher priority)
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
d769be6c
PH
108 * http_referer HTTP Referer header value to set.
109 * http_method HTTP method to use for the download.
110 * http_headers A dictionary of additional HTTP headers
111 to add to the request.
112 * http_post_data Additional data to send with a POST
113 request.
c0ba0f48 114 url: Final video URL.
d6983cb4 115 ext: Video filename extension.
d67b0b15
PH
116 format: The video format, defaults to ext (used for --get-format)
117 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 118
d6983cb4
PH
119 The following fields are optional:
120
0afef30b
PH
121 display_id An alternative identifier for the video, not necessarily
122 unique, but available before title. Typically, id is
123 something like "4234987", title "Dancing naked mole rats",
124 and display_id "dancing-naked-mole-rats"
d5519808
PH
125 thumbnails: A list of dictionaries, with the following entries:
126 * "url"
127 * "width" (optional, int)
128 * "height" (optional, int)
129 * "resolution" (optional, string "{width}x{height}",
130 deprecated)
d6983cb4
PH
131 thumbnail: Full URL to a video thumbnail image.
132 description: One-line video description.
133 uploader: Full name of the video uploader.
955c4514 134 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 135 upload_date: Video upload date (YYYYMMDD).
955c4514 136 If not explicitly set, calculated from timestamp.
d6983cb4 137 uploader_id: Nickname or id of the video uploader.
da9ec3b9 138 location: Physical location where the video was filmed.
5d51a883
JMF
139 subtitles: The subtitle file contents as a dictionary in the format
140 {language: subtitles}.
c0ba0f48 141 duration: Length of the video in seconds, as an integer.
f3d29461 142 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
143 like_count: Number of positive ratings of the video
144 dislike_count: Number of negative ratings of the video
145 comment_count: Number of comments on the video
8dbe9899 146 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
147 webpage_url: The url to the video webpage, if given to youtube-dl it
148 should allow to get the same result again. (It will be set
149 by YoutubeDL if it's missing)
ad3bc6ac
PH
150 categories: A list of categories that the video falls in, for example
151 ["Sports", "Berlin"]
7267bd53
PH
152 is_live: True, False, or None (=unknown). Whether this video is a
153 live stream that goes on instead of a fixed-length video.
d6983cb4 154
deefc05b 155 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 156
d838b1bd
PH
157 Unless mentioned otherwise, None is equivalent to absence of information.
158
fed5d032
PH
159
160 _type "playlist" indicates multiple videos.
161 There must be a key "entries", which is a list or a PagedList object, each
162 element of which is a valid dictionary under this specification.
163
164 Additionally, playlists can have "title" and "id" attributes with the same
165 semantics as videos (see above).
166
167
168 _type "multi_video" indicates that there are multiple videos that
169 form a single show, for example multiple acts of an opera or TV episode.
170 It must have an entries key like a playlist and contain all the keys
171 required for a video at the same time.
172
173
174 _type "url" indicates that the video must be extracted from another
175 location, possibly by a different extractor. Its only required key is:
176 "url" - the next URL to extract.
177
178 Additionally, it may have properties believed to be identical to the
179 resolved entity, for example "title" if the title of the referred video is
180 known ahead of time.
181
182
183 _type "url_transparent" entities have the same specification as "url", but
184 indicate that the given additional information is more precise than the one
185 associated with the resolved URL.
186 This is useful when a site employs a video service that hosts the video and
187 its technical metadata, but that video service does not embed a useful
188 title, description etc.
189
190
d6983cb4
PH
191 Subclasses of this one should re-define the _real_initialize() and
192 _real_extract() methods and define a _VALID_URL regexp.
193 Probably, they should also be added to the list of extractors.
194
d6983cb4
PH
195 Finally, the _WORKING attribute should be set to False for broken IEs
196 in order to warn the users and skip the tests.
197 """
198
199 _ready = False
200 _downloader = None
201 _WORKING = True
202
203 def __init__(self, downloader=None):
204 """Constructor. Receives an optional downloader."""
205 self._ready = False
206 self.set_downloader(downloader)
207
@classmethod
def suitable(cls, url):
    """Return True if url matches this IE's _VALID_URL, False otherwise."""
    # cls.__dict__ is consulted directly instead of hasattr/getattr on
    # purpose: the compiled pattern is cached per class, and a normal
    # attribute lookup would also find a pattern cached on a superclass.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return valid_url_re.match(url) is not None
d6983cb4 218
ed9266db
PH
219 @classmethod
220 def _match_id(cls, url):
221 if '_VALID_URL_RE' not in cls.__dict__:
222 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
223 m = cls._VALID_URL_RE.match(url)
224 assert m
225 return m.group('id')
226
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class's _WORKING attribute."""
    is_working = cls._WORKING
    return is_working
231
def initialize(self):
    """Run _real_initialize() (authentication, etc.) the first time only."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
237
def extract(self, url):
    """Initialize the IE if needed and return what _real_extract(url) returns."""
    self.initialize()
    result = self._real_extract(url)
    return result
242
def set_downloader(self, downloader):
    """Store downloader as the downloader used by this IE."""
    setattr(self, '_downloader', downloader)
246
247 def _real_initialize(self):
248 """Real initialization process. Redefine in subclasses."""
249 pass
250
251 def _real_extract(self, url):
252 """Real extraction process. Redefine in subclasses."""
253 pass
254
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the class name without its last two characters.

    The result is the string used to look the extractor up with
    get_info_extractor.
    """
    class_name = cls.__name__
    return class_name[:-2]
259
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the instance's class without its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
263
7cc3570e 264 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
265 """ Returns the response handle """
266 if note is None:
267 self.report_download_webpage(video_id)
268 elif note is not False:
7cc3570e 269 if video_id is None:
f1a9d64e 270 self.to_screen('%s' % (note,))
7cc3570e 271 else:
f1a9d64e 272 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 273 try:
dca08720 274 return self._downloader.urlopen(url_or_request)
d6983cb4 275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
276 if errnote is False:
277 return False
d6983cb4 278 if errnote is None:
f1a9d64e
PH
279 errnote = 'Unable to download webpage'
280 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
281 if fatal:
282 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
283 else:
284 self._downloader.report_warning(errmsg)
285 return False
d6983cb4 286
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Return a tuple (page content as string, URL handle).

    Returns False instead when _request_webpage returned False.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        assert not fatal
        return False
    page = self._webpage_read_content(handle, url_or_request, video_id, note, errnote, fatal)
    return (page, handle)
299
4e262a88 300 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
d6983cb4 301 content_type = urlh.headers.get('Content-Type', '')
f143d86a 302 webpage_bytes = urlh.read()
4e262a88
PH
303 if prefix is not None:
304 webpage_bytes = prefix + webpage_bytes
d6983cb4
PH
305 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
306 if m:
307 encoding = m.group(1)
308 else:
0d75ae2c 309 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
310 webpage_bytes[:1024])
311 if m:
312 encoding = m.group(1).decode('ascii')
b60016e8
PH
313 elif webpage_bytes.startswith(b'\xff\xfe'):
314 encoding = 'utf-16'
f143d86a
PH
315 else:
316 encoding = 'utf-8'
d6983cb4
PH
317 if self._downloader.params.get('dump_intermediate_pages', False):
318 try:
319 url = url_or_request.get_full_url()
320 except AttributeError:
321 url = url_or_request
f1a9d64e 322 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
323 dump = base64.b64encode(webpage_bytes).decode('ascii')
324 self._downloader.to_screen(dump)
d41e6efc
PH
325 if self._downloader.params.get('write_pages', False):
326 try:
327 url = url_or_request.get_full_url()
328 except AttributeError:
329 url = url_or_request
5afa7f8b 330 basen = '%s_%s' % (video_id, url)
c1bce22f 331 if len(basen) > 240:
f1a9d64e 332 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
333 basen = basen[:240 - len(h)] + h
334 raw_filename = basen + '.dump'
d41e6efc 335 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 336 self.to_screen('Saving request to ' + filename)
5f58165d
S
337 # Working around MAX_PATH limitation on Windows (see
338 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
339 if os.name == 'nt':
340 absfilepath = os.path.abspath(filename)
341 if len(absfilepath) > 259:
342 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
343 with open(filename, 'wb') as outf:
344 outf.write(webpage_bytes)
345
ec0fafbb
AA
346 try:
347 content = webpage_bytes.decode(encoding, 'replace')
348 except LookupError:
349 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 350
f1a9d64e
PH
351 if ('<title>Access to this site is blocked</title>' in content and
352 'Websense' in content[:512]):
353 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
354 blocked_iframe = self._html_search_regex(
355 r'<iframe src="([^"]+)"', content,
f1a9d64e 356 'Websense information URL', default=None)
2410c43d 357 if blocked_iframe:
f1a9d64e 358 msg += ' Visit %s for more details' % blocked_iframe
2410c43d
PH
359 raise ExtractorError(msg, expected=True)
360
23be51d8 361 return content
d6983cb4 362
7cc3570e 363 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 364 """ Returns the data of the page as a string """
7cc3570e
PH
365 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
366 if res is False:
367 return res
368 else:
369 content, _ = res
370 return content
d6983cb4 371
2a275ab0 372 def _download_xml(self, url_or_request, video_id,
f1a9d64e 373 note='Downloading XML', errnote='Unable to download XML',
28746fbd 374 transform_source=None, fatal=True):
267ed0c5 375 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
376 xml_string = self._download_webpage(
377 url_or_request, video_id, note, errnote, fatal=fatal)
378 if xml_string is False:
379 return xml_string
e2b38da9
PH
380 if transform_source:
381 xml_string = transform_source(xml_string)
267ed0c5
JMF
382 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
383
3d3538e4 384 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
385 note='Downloading JSON metadata',
386 errnote='Unable to download JSON metadata',
b090af59
PH
387 transform_source=None,
388 fatal=True):
389 json_string = self._download_webpage(
390 url_or_request, video_id, note, errnote, fatal=fatal)
391 if (not fatal) and json_string is False:
392 return None
81c2f20b
PH
393 if transform_source:
394 json_string = transform_source(json_string)
3d3538e4
PH
395 try:
396 return json.loads(json_string)
397 except ValueError as ve:
e7b6d122
PH
398 errmsg = '%s: Failed to parse JSON ' % video_id
399 if fatal:
400 raise ExtractorError(errmsg, cause=ve)
401 else:
402 self.report_warning(errmsg + str(ve))
3d3538e4 403
def report_warning(self, msg, video_id=None):
    """Report msg as a warning, prefixed with '[ie_name]' and the video id.

    The video id part is left out when video_id is None.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 408
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
412
def report_extraction(self, id_or_name):
    """Tell the user that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
416
def report_download_webpage(self, video_id):
    """Tell the user that the webpage of video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
420
def report_age_confirmation(self):
    """Tell the user that age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 424
fc79158d
JMF
def report_login(self):
    """Tell the user that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 428
5f6a1245 429 # Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Return a result of type "url" pointing to a page that should be processed.

    ie is stored under 'ie_key'; 'id' is only set when video_id is not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {}
    result['_type'] = 'url'
    result['url'] = url
    result['ie_key'] = ie
    if video_id is not None:
        result['id'] = video_id
    return result
5f6a1245 440
c0d0b01f
JMF
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Return a result of type "playlist" holding entries.

    'id' and 'title' are only included when the corresponding argument
    is truthy.
    """
    result = {'_type': 'playlist', 'entries': entries}
    optional_fields = (('id', playlist_id), ('title', playlist_title))
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
451
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a string, a compiled pattern, or an iterable of them; the
        first one that matches is used.
    group: the group to return; None returns the first group that is
        not None.
    default: returned when nothing matches (takes precedence over fatal).
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        pattern = [pattern]

    # Start from "no match" so that an empty list of patterns is handled
    # like a failed search instead of leaving mobj unbound.
    mobj = None
    for p in pattern:
        mobj = re.search(p, string, flags)
        if mobj:
            break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        return mobj.group(group)
    if default is not _NO_DEFAULT:
        return default

    # The (possibly colorized) field name is only needed for the messages
    # below, so it is computed on the failure path only.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s; '
                                    'please report this issue on http://yt-dl.org/bug' % _name)
    return None
486
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html and
    stripped. A falsy result (None, '', a falsy default) is returned as is.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
496
fc79158d
JMF
497 def _get_login_info(self):
498 """
499 Get the the login info as (username, password)
500 It will look in the netrc file using the _NETRC_MACHINE value
501 If there's no info available, return (None, None)
502 """
503 if self._downloader is None:
504 return (None, None)
505
506 username = None
507 password = None
508 downloader_params = self._downloader.params
509
510 # Attempt to use provided username and password or .netrc data
511 if downloader_params.get('username', None) is not None:
512 username = downloader_params['username']
513 password = downloader_params['password']
514 elif downloader_params.get('usenetrc', False):
515 try:
516 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
517 if info is not None:
518 username = info[0]
519 password = info[2]
520 else:
521 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
522 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 523 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 524
fc79158d
JMF
525 return (username, password)
526
83317f69 527 def _get_tfa_info(self):
528 """
529 Get the two-factor authentication info
530 TODO - asking the user will be required for sms/phone verify
531 currently just uses the command line option
532 If there's no info available, return None
533 """
534 if self._downloader is None:
83317f69 535 return None
536 downloader_params = self._downloader.params
537
538 if downloader_params.get('twofactor', None) is not None:
539 return downloader_params['twofactor']
540
83317f69 541 return None
542
46720279
JMF
543 # Helper functions for extracting OpenGraph info
544 @staticmethod
ab2d5247 545 def _og_regexes(prop):
c1206423 546 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 547 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 548 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 549 return [
78fb87b2
JMF
550 template % (property_re, content_re),
551 template % (content_re, property_re),
ab2d5247 552 ]
46720279 553
3c4e6d83 554 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 555 if name is None:
3c4e6d83 556 name = 'OpenGraph %s' % prop
ab2d5247 557 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
558 if escaped is None:
559 return None
560 return unescapeHTML(escaped)
46720279
JMF
561
562 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 563 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
564
565 def _og_search_description(self, html, **kargs):
566 return self._og_search_property('description', html, fatal=False, **kargs)
567
568 def _og_search_title(self, html, **kargs):
569 return self._og_search_property('title', html, **kargs)
570
8ffa13e0 571 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
572 regexes = self._og_regexes('video') + self._og_regexes('video:url')
573 if secure:
574 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 575 return self._html_search_regex(regexes, html, name, **kargs)
46720279 576
78338f71
JMF
577 def _og_search_url(self, html, **kargs):
578 return self._og_search_property('url', html, **kargs)
579
40c696e5 580 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
581 if display_name is None:
582 display_name = name
583 return self._html_search_regex(
aaebed13 584 r'''(?ix)<meta
711ede6e
PH
585 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
586 [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
587 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
588
589 def _dc_search_uploader(self, html):
590 return self._html_search_meta('dc.creator', html, 'uploader')
591
8dbe9899
PH
592 def _rta_search(self, html):
593 # See http://www.rtalabel.org/index.php?content=howtofaq#single
594 if re.search(r'(?ix)<meta\s+name="rating"\s+'
595 r' content="RTA-5042-1996-1400-1577-RTA"',
596 html):
597 return 18
598 return 0
599
59040888
PH
600 def _media_rating_search(self, html):
601 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
602 rating = self._html_search_meta('rating', html)
603
604 if not rating:
605 return None
606
607 RATING_TABLE = {
608 'safe for kids': 0,
609 'general': 8,
610 '14 years': 14,
611 'mature': 17,
612 'restricted': 19,
613 }
614 return RATING_TABLE.get(rating.lower(), None)
615
0c708f11
JMF
616 def _twitter_search_player(self, html):
617 return self._html_search_meta('twitter:player', html,
9e1a5b84 618 'twitter card player')
0c708f11 619
4bcc7bd1 620 def _sort_formats(self, formats):
7e8caf30 621 if not formats:
f1a9d64e 622 raise ExtractorError('No video formats found')
7e8caf30 623
4bcc7bd1 624 def _formats_key(f):
e6812ac9
PH
625 # TODO remove the following workaround
626 from ..utils import determine_ext
627 if not f.get('ext') and 'url' in f:
628 f['ext'] = determine_ext(f['url'])
629
4bcc7bd1
PH
630 preference = f.get('preference')
631 if preference is None:
c7deaa4c
PH
632 proto = f.get('protocol')
633 if proto is None:
634 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
635
636 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
637 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
638 preference -= 0.5
639
640 if f.get('vcodec') == 'none': # audio only
641 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 642 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 643 else:
f1a9d64e 644 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
645 ext_preference = 0
646 try:
647 audio_ext_preference = ORDER.index(f['ext'])
648 except ValueError:
649 audio_ext_preference = -1
650 else:
651 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 652 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 653 else:
f1a9d64e 654 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
655 try:
656 ext_preference = ORDER.index(f['ext'])
657 except ValueError:
658 ext_preference = -1
659 audio_ext_preference = 0
660
661 return (
662 preference,
aff2f4f4 663 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 664 f.get('quality') if f.get('quality') is not None else -1,
4bcc7bd1
PH
665 f.get('height') if f.get('height') is not None else -1,
666 f.get('width') if f.get('width') is not None else -1,
667 ext_preference,
9933b574 668 f.get('tbr') if f.get('tbr') is not None else -1,
4bcc7bd1
PH
669 f.get('vbr') if f.get('vbr') is not None else -1,
670 f.get('abr') if f.get('abr') is not None else -1,
671 audio_ext_preference,
2c8e03d9 672 f.get('fps') if f.get('fps') is not None else -1,
4bcc7bd1 673 f.get('filesize') if f.get('filesize') is not None else -1,
9732d77e 674 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 675 f.get('source_preference') if f.get('source_preference') is not None else -1,
4bcc7bd1
PH
676 f.get('format_id'),
677 )
678 formats.sort(key=_formats_key)
59040888 679
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    prefer_insecure = self._downloader.params.get('prefer_insecure', False)
    if prefer_insecure:
        return 'http:'
    return 'https:'
686
57c7411f
PH
687 def _proto_relative_url(self, url, scheme=None):
688 if url is None:
689 return url
690 if url.startswith('//'):
691 if scheme is None:
692 scheme = self.http_scheme()
693 return scheme + url
694 else:
695 return url
696
4094b6e3
PH
697 def _sleep(self, timeout, video_id, msg_template=None):
698 if msg_template is None:
f1a9d64e 699 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
700 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
701 self.to_screen(msg)
702 time.sleep(timeout)
703
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download the f4m manifest and return a sorted list of its formats.

    One format is created per <media> element of the manifest; all of them
    use manifest_url as their URL. The format id is 'f4m-<bitrate>', or
    'f4m-<index of the element>' when the bitrate is unknown.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for index, media_el in enumerate(media_nodes):
        attrs = media_el.attrib
        tbr = int_or_none(attrs.get('bitrate'))
        id_number = index if tbr is None else tbr
        formats.append({
            'format_id': 'f4m-%d' % id_number,
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
        })
    self._sort_formats(formats)

    return formats
725
f0b5d6af
PH
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Download an m3u8 playlist and return a sorted list of its formats.

    The list always contains an 'm3u8-meta' format pointing to m3u8_url
    itself. Every URL line of the playlist adds one format; the attributes
    of the preceding #EXT-X-STREAM-INF line (BANDWIDTH, CODECS, RESOLUTION)
    provide tbr, vcodec/acodec and width/height. URL lines seen before any
    #EXT-X-STREAM-INF line yield a format with only a 'url'.

    ext, entry_protocol and preference are copied into the formats
    created from #EXT-X-STREAM-INF entries.
    """
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Relative URLs are resolved against the playlist URL
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            continue
        else:
            if last_info is None:
                # Strip the line here as well, as done for the entries
                # below, so that surrounding whitespace never ends up in
                # the URL
                formats.append({'url': format_url(line.strip())})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            # The attributes apply to a single URL line only
            last_info = {}
    self._sort_formats(formats)
    return formats
792
f4b1c7ad
PH
793 def _live_title(self, name):
794 """ Generate the title for a live video """
795 now = datetime.datetime.now()
796 now_str = now.strftime("%Y-%m-%d %H:%M")
797 return name + ' ' + now_str
798
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs) and report failures.

    name is the field name used in the message. When the conversion yields
    None, an ExtractorError is raised if fatal is true, otherwise a warning
    is reported and None is returned.
    """
    # Note: a leftover debugging print() of the 'get_attr' attribute used
    # to be here; it wrote to stdout on every call and has been removed.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
810
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs) and report failures.

    name is the field name used in the message. When the conversion yields
    None, an ExtractorError is raised if fatal is true, otherwise a warning
    is reported and None is returned.
    """
    value = float_or_none(v, **kwargs)
    if value is not None:
        return value
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return value
820
def _set_cookie(self, domain, name, value, expire_time=None):
    """Add a cookie for domain and path '/' to the downloader's cookie jar."""
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
826
8dbe9899 827
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex for search URLs, built from _SEARCH_KEY."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and return the requested number of results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS,
        and a number requests that many, capped at _MAX_RESULTS (with a
        warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The value of _SEARCH_KEY."""
        return self._SEARCH_KEY