from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_getpass,
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urlparse,
    compat_str,
    compat_etree_fromstring,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    int_or_none,
    parse_iso8601,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    unescapeHTML,
    unified_strdate,
    url_basename,
    xpath_text,
    xpath_with_ns,
    determine_protocol,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", or "m3u8_native".
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    creator:        The main artist who created the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {language: subformats}. "subformats" is a list sorted from
                    lower to higher preference, each element is a dictionary
                    with the "ext" entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series or programme:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.
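
    For illustration, a minimal "video" result might look like this (all
    values below are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video_low.mp4',
                'format_id': 'low',
                'height': 360,
            }, {
                'url': 'https://example.com/video_hd.mp4',
                'format_id': 'hd',
                'height': 1080,
            }],
        }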

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title", "description" and "id" attributes
    with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return m.group('id')
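
    # A brief usage sketch (the URL pattern and URLs are hypothetical):
    #
    #   class ExampleIE(InfoExtractor):
    #       _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #   ExampleIE.suitable('http://example.com/watch/42')   # -> True
    #   ExampleIE._match_id('http://example.com/watch/42')  # -> '42'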

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it as a list of dicts."""
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
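
    # A typical call from a subclass's _real_extract() (the URL and keys are
    # hypothetical):
    #
    #   data = self._download_json(
    #       'https://example.com/api/video/%s' % video_id, video_id,
    #       note='Downloading video metadata')
    #   title = data['title']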

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        raise ExtractorError(
            '%s. You might want to use --proxy to work around this.' % msg,
            expected=True)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
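
    # A minimal delegation sketch (the video ID is hypothetical): an extractor
    # that only resolves an embed page might end its _real_extract() with:
    #
    #   return self.url_result(
    #       'https://www.youtube.com/watch?v=%s' % youtube_id, ie='Youtube')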

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, emit a warning, or raise a
        RegexNotFoundError, depending on default and fatal, specifying the
        field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
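
    # A usage sketch (the webpage snippet and patterns are hypothetical):
    #
    #   webpage = '<div data-video-id="1234">'
    #   self._search_regex(
    #       r'data-video-id="(\d+)"', webpage, 'video id')          # -> '1234'
    #   self._search_regex(
    #       r'data-missing="(\d+)"', webpage, 'foo', default=None)  # -> None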

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
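
    # A usage sketch for the OpenGraph/meta helpers (the markup is
    # hypothetical):
    #
    #   webpage = ('<meta property="og:title" content="My clip">'
    #              '<meta name="description" content="A short description">')
    #   self._og_search_title(webpage)                  # -> 'My clip'
    #   self._html_search_meta('description', webpage)  # -> 'A short description'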

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        if not json_ld:
            return {}
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))

    def _json_ld(self, json_ld, video_id, fatal=True):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                })
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
                })
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                continue
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            if not name:
                continue
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            if not value:
                continue
            hidden_inputs[name.group('value')] = value.group('value')
        return hidden_inputs
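
    # A sketch of what _hidden_inputs() returns (the markup is hypothetical):
    #
    #   html = '<input type="hidden" name="token" value="abc123">'
    #   InfoExtractor._hidden_inputs(html)  # -> {'token': 'abc123'}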

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
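
    # A sketch of the effect (values hypothetical): formats end up ordered
    # from worst to best, so with two formats differing only in height:
    #
    #   formats = [{'url': 'https://example.com/hd.mp4', 'height': 1080},
    #              {'url': 'https://example.com/low.mp4', 'height': 360}]
    #   self._sort_formats(formats)
    #   [f['height'] for f in formats]  # -> [360, 1080]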

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    def _is_valid_url(self, url, video_id, item='video'):
        url = self._proto_relative_url(url, scheme='http:')
        # For now, assume non-HTTP(S) URLs are always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):

        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()
        # A Media Playlist Tag MUST NOT appear in a Master Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        last_info = None
        last_media = None
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                continue
            else:
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: the video codec is not necessarily always first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
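
    # A sketch of the master playlist lines this parser consumes (the
    # contents are hypothetical):
    #
    #   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.4d401e,mp4a.40.2"
    #   low/index.m3u8
    #
    # would yield a format with tbr=1280, width=640, height=360,
    # vcodec='avc1.4d401e' and acodec='mp4a.40.2', with the relative URL
    # resolved against the (possibly redirected) m3u8_url.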

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
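
    # A sketch of what _xpath_ns() produces (the namespace URI is
    # hypothetical):
    #
    #   InfoExtractor._xpath_ns('./head/meta', 'urn:example')
    #   # -> './{urn:example}head/{urn:example}meta'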

    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src:
                continue
            ext = textstream.get('ext') or determine_ext(src)
            if not ext:
                type_ = textstream.get('type')
                SUBTITLES_TYPES = {
                    'text/vtt': 'vtt',
                    'text/srt': 'srt',
                    'application/smptett+xml': 'tt',
                }
                if type_ in SUBTITLES_TYPES:
                    ext = SUBTITLES_TYPES[type_]
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            playlist_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(xspf, playlist_id)

    def _parse_xspf(self, playlist, playlist_id):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = [{
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _parse_dash_manifest(self, dash_doc, namespace=None, formats_dict={}):
        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        formats = []
        for a in dash_doc.findall('.//' + _add_ns('AdaptationSet')):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall(_add_ns('Representation')):
                mime_type = r.attrib.get('mimeType') or mime_type
                url_el = r.find(_add_ns('BaseURL'))
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find(_add_ns('SegmentList'))
                    format_id = r.attrib['id']
                    video_url = url_el.text if url_el is not None else None
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        initialization_url = segment_list.find(_add_ns('Initialization')).attrib['sourceURL']
                        f.update({
                            'initialization_url': initialization_url,
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall(_add_ns('SegmentURL'))],
                            'protocol': 'http_dash_segments',
                        })
                        if not f.get('url'):
                            f['url'] = initialization_url
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        full_info = formats_dict.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            if mime_type.startswith('video/'):
                                vcodec, acodec = codecs, 'none'
                            else:  # mime_type.startswith('audio/')
                                vcodec, acodec = 'none', codecs

                            full_info.update({
                                'vcodec': vcodec,
                                'acodec': acodec,
                            })
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M")
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            print(getattr(v, kwargs['get_attr']))
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None):
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))

    def get_testcases(self, include_onlymatching=False):
        t = getattr(self, '_TEST', None)
        if t:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = [t]
        else:
            tests = getattr(self, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = type(self).__name__[:-len('IE')]
            yield t

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if 'playlist' in tc:
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

    def extract_subtitles(self, *args, **kwargs):
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError("This method must be implemented by subclasses")

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
        return ret
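
    # A sketch of the merge behaviour (values hypothetical): per language,
    # items whose URL already occurs in the first dictionary are dropped:
    #
    #   d1 = {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'}]}
    #   d2 = {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'},
    #                {'url': 'http://example.com/b.srt', 'ext': 'srt'}]}
    #   InfoExtractor._merge_subtitles(d1, d2)
    #   # -> {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'},
    #   #            {'url': 'http://example.com/b.srt', 'ext': 'srt'}]}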

    def extract_automatic_captions(self, *args, **kwargs):
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError("This method must be implemented by subclasses")


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
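
    # A sketch of the accepted query forms, assuming _SEARCH_KEY = 'ytsearch'
    # (the prefix semantics are implemented in _real_extract() below):
    #
    #   'ytsearch:some query'    -> first result
    #   'ytsearch5:some query'   -> first 5 results
    #   'ytsearchall:some query' -> up to _MAX_RESULTS results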

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY