# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    join_nonempty,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    UnsupportedError,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                       - HTTP URL to plain file media (in case of
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

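    A minimal illustrative info dict (all values hypothetical) might
    therefore look like:

        {
            'id': '42',
            'title': 'Example video',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'http-720p',
                'width': 1280,
                'height': 720,
            }],
        }
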
    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
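
    For instance, a playlist result (values hypothetical, with each entry
    being a valid video info dict) might look like:

        {
            '_type': 'playlist',
            'id': 'channel-42',
            'title': 'Example channel',
            'entries': [video_info_dict_1, video_info_dict_2],
        }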


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
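
    For instance (illustrative values):

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
            'ie_key': 'Youtube',
            'title': 'Example video',
        }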


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

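        For example (illustrative values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['192.168.1.0/24'],
            })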
530 """
531 if not self._x_forwarded_for_ip:
532
533 # Geo bypass mechanism is explicitly disabled by user
534 if not self.get_param('geo_bypass', True):
535 return
536
537 if not geo_bypass_context:
538 geo_bypass_context = {}
539
540 # Backward compatibility: previously _initialize_geo_bypass
541 # expected a list of countries, some 3rd party code may still use
542 # it this way
543 if isinstance(geo_bypass_context, (list, tuple)):
544 geo_bypass_context = {
545 'countries': geo_bypass_context,
546 }
547
548 # The whole point of geo bypass mechanism is to fake IP
549 # as X-Forwarded-For HTTP header based on some IP block or
550 # country code.
551
552 # Path 1: bypassing based on IP block in CIDR notation
553
554 # Explicit IP block specified by user, use it right away
555 # regardless of whether extractor is geo bypassable or not
556 ip_block = self.get_param('geo_bypass_ip_block', None)
557
558 # Otherwise use random IP block from geo bypass context but only
559 # if extractor is known as geo bypassable
560 if not ip_block:
561 ip_blocks = geo_bypass_context.get('ip_blocks')
562 if self._GEO_BYPASS and ip_blocks:
563 ip_block = random.choice(ip_blocks)
564
565 if ip_block:
566 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
567 self._downloader.write_debug(
568 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
569 return
570
571 # Path 2: bypassing based on country code
572
573 # Explicit country code specified by user, use it right away
574 # regardless of whether extractor is geo bypassable or not
575 country = self.get_param('geo_bypass_country', None)
576
577 # Otherwise use random country code from geo bypass context but
578 # only if extractor is known as geo bypassable
579 if not country:
580 countries = geo_bypass_context.get('countries')
581 if self._GEO_BYPASS and countries:
582 country = random.choice(countries)
583
584 if country:
585 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
586 self._downloader.write_debug(
587 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
588
589 def extract(self, url):
590 """Extracts URL information and returns it in list of dicts."""
591 try:
592 for _ in range(2):
593 try:
594 self.initialize()
595 self.write_debug('Extracting URL: %s' % url)
596 ie_result = self._real_extract(url)
597 if ie_result is None:
598 return None
599 if self._x_forwarded_for_ip:
600 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
601 subtitles = ie_result.get('subtitles')
602 if (subtitles and 'live_chat' in subtitles
603 and 'no-live-chat' in self.get_param('compat_opts', [])):
604 del subtitles['live_chat']
605 return ie_result
606 except GeoRestrictedError as e:
607 if self.__maybe_fake_ip_and_retry(e.countries):
608 continue
609 raise
610 except UnsupportedError:
611 raise
612 except ExtractorError as e:
613 kwargs = {
614 'video_id': e.video_id or self.get_temp_id(url),
615 'ie': self.IE_NAME,
616 'tb': e.traceback,
617 'expected': e.expected,
618 'cause': e.cause
619 }
620 if hasattr(e, 'countries'):
621 kwargs['countries'] = e.countries
622 raise type(e)(e.msg, **kwargs)
623 except compat_http_client.IncompleteRead as e:
624 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
625 except (KeyError, StopIteration) as e:
626 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
627
628 def __maybe_fake_ip_and_retry(self, countries):
629 if (not self.get_param('geo_bypass_country', None)
630 and self._GEO_BYPASS
631 and self.get_param('geo_bypass', True)
632 and not self._x_forwarded_for_ip
633 and countries):
634 country_code = random.choice(countries)
635 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
636 if self._x_forwarded_for_ip:
637 self.report_warning(
638 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
639 % (self._x_forwarded_for_ip, country_code.upper()))
640 return True
641 return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
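
        For example (illustrative), to accept a 404 page as valid content:

            webpage = self._download_webpage(
                url, video_id, expected_status=404)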
873 """
874
875 success = False
876 try_count = 0
877 while success is False:
878 try:
879 res = self._download_webpage_handle(
880 url_or_request, video_id, note, errnote, fatal,
881 encoding=encoding, data=data, headers=headers, query=query,
882 expected_status=expected_status)
883 success = True
884 except compat_http_client.IncompleteRead as e:
885 try_count += 1
886 if try_count >= tries:
887 raise e
888 self._sleep(timeout, video_id)
889 if res is False:
890 return res
891 else:
892 content, _ = res
893 return content
894
895 def _download_xml_handle(
896 self, url_or_request, video_id, note='Downloading XML',
897 errnote='Unable to download XML', transform_source=None,
898 fatal=True, encoding=None, data=None, headers={}, query={},
899 expected_status=None):
900 """
901 Return a tuple (xml as an compat_etree_Element, URL handle).
902
903 See _download_webpage docstring for arguments specification.
904 """
905 res = self._download_webpage_handle(
906 url_or_request, video_id, note, errnote, fatal=fatal,
907 encoding=encoding, data=data, headers=headers, query=query,
908 expected_status=expected_status)
909 if res is False:
910 return res
911 xml_string, urlh = res
912 return self._parse_xml(
913 xml_string, video_id, transform_source=transform_source,
914 fatal=fatal), urlh
915
916 def _download_xml(
917 self, url_or_request, video_id,
918 note='Downloading XML', errnote='Unable to download XML',
919 transform_source=None, fatal=True, encoding=None,
920 data=None, headers={}, query={}, expected_status=None):
921 """
922 Return the xml as an compat_etree_Element.
923
924 See _download_webpage docstring for arguments specification.
925 """
926 res = self._download_xml_handle(
927 url_or_request, video_id, note=note, errnote=errnote,
928 transform_source=transform_source, fatal=fatal, encoding=encoding,
929 data=data, headers=headers, query=query,
930 expected_status=expected_status)
931 return res if res is False else res[0]
932
933 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
934 if transform_source:
935 xml_string = transform_source(xml_string)
936 try:
937 return compat_etree_fromstring(xml_string.encode('utf-8'))
938 except compat_xml_parse_error as ve:
939 errmsg = '%s: Failed to parse XML ' % video_id
940 if fatal:
941 raise ExtractorError(errmsg, cause=ve)
942 else:
943 self.report_warning(errmsg + str(ve))
944
945 def _download_json_handle(
946 self, url_or_request, video_id, note='Downloading JSON metadata',
947 errnote='Unable to download JSON metadata', transform_source=None,
948 fatal=True, encoding=None, data=None, headers={}, query={},
949 expected_status=None):
950 """
951 Return a tuple (JSON object, URL handle).
952
953 See _download_webpage docstring for arguments specification.
954 """
955 res = self._download_webpage_handle(
956 url_or_request, video_id, note, errnote, fatal=fatal,
957 encoding=encoding, data=data, headers=headers, query=query,
958 expected_status=expected_status)
959 if res is False:
960 return res
961 json_string, urlh = res
962 return self._parse_json(
963 json_string, video_id, transform_source=transform_source,
964 fatal=fatal), urlh
965
966 def _download_json(
967 self, url_or_request, video_id, note='Downloading JSON metadata',
968 errnote='Unable to download JSON metadata', transform_source=None,
969 fatal=True, encoding=None, data=None, headers={}, query={},
970 expected_status=None):
971 """
972 Return the JSON object as a dict.
973
974 See _download_webpage docstring for arguments specification.
975 """
976 res = self._download_json_handle(
977 url_or_request, video_id, note=note, errnote=errnote,
978 transform_source=transform_source, fatal=fatal, encoding=encoding,
979 data=data, headers=headers, query=query,
980 expected_status=expected_status)
981 return res if res is False else res[0]
982
983 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
984 if transform_source:
985 json_string = transform_source(json_string)
986 try:
987 return json.loads(json_string)
988 except ValueError as ve:
989 errmsg = '%s: Failed to parse JSON ' % video_id
990 if fatal:
991 raise ExtractorError(errmsg, cause=ve)
992 else:
993 self.report_warning(errmsg + str(ve))
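
    # A transform can be used to turn near-JSON (e.g. JavaScript object
    # literals) into valid JSON before parsing; illustrative usage, with
    # `js_object` standing in for some scraped string:
    #   data = self._parse_json(js_object, video_id, transform_source=js_to_json)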

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        video_info.update(kwargs)
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
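
    # Illustrative delegation to another extractor (`embed_url` is a
    # hypothetical variable holding a URL found in the page):
    #   return self.url_result(embed_url, ie='Youtube', video_id=video_id)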

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
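
        For example (illustrative pattern and variable names):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)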
1145 """
1146 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1147 mobj = re.search(pattern, string, flags)
1148 else:
1149 for p in pattern:
1150 mobj = re.search(p, string, flags)
1151 if mobj:
1152 break
1153
1154 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1155
1156 if mobj:
1157 if group is None:
1158 # return the first matching group
1159 return next(g for g in mobj.groups() if g is not None)
1160 elif isinstance(group, (list, tuple)):
1161 return tuple(mobj.group(g) for g in group)
1162 else:
1163 return mobj.group(group)
1164 elif default is not NO_DEFAULT:
1165 return default
1166 elif fatal:
1167 raise RegexNotFoundError('Unable to extract %s' % _name)
1168 else:
1169 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1170 return None
1171
1172 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1173 """
1174 Like _search_regex, but strips HTML tags and unescapes entities.
1175 """
1176 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1177 if res:
1178 return clean_html(res).strip()
1179 else:
1180 return res
1181
1182 def _get_netrc_login_info(self, netrc_machine=None):
1183 username = None
1184 password = None
1185 netrc_machine = netrc_machine or self._NETRC_MACHINE
1186
1187 if self.get_param('usenetrc', False):
1188 try:
1189 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1190 if os.path.isdir(netrc_file):
1191 netrc_file = os.path.join(netrc_file, '.netrc')
1192 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1193 if info is not None:
1194 username = info[0]
1195 password = info[2]
1196 else:
1197 raise netrc.NetrcParseError(
1198 'No authenticators for %s' % netrc_machine)
1199 except (IOError, netrc.NetrcParseError) as err:
1200 self.report_warning(
1201 'parsing .netrc: %s' % error_to_compat_str(err))
1202
1203 return username, password
1204
1205 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1206 """
1207 Get the login info as (username, password)
1208 First look for the manually specified credentials using username_option
1209 and password_option as keys in params dictionary. If no such credentials
1210 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1211 value.
1212 If there's no info available, return (None, None)
1213 """
1214
1215 # Attempt to use provided username and password or .netrc data
1216 username = self.get_param(username_option)
1217 if username is not None:
1218 password = self.get_param(password_option)
1219 else:
1220 username, password = self._get_netrc_login_info(netrc_machine)
1221
1222 return username, password
1223
1224 def _get_tfa_info(self, note='two-factor verification code'):
1225 """
1226 Get the two-factor authentication info
1227 TODO - asking the user will be required for sms/phone verify
1228 currently just uses the command line option
1229 If there's no info available, return None
1230 """
1231
1232 tfa = self.get_param('twofactor')
1233 if tfa is not None:
1234 return tfa
1235
1236 return compat_getpass('Type %s and press [Return]: ' % note)
1237
1238 # Helper functions for extracting OpenGraph info
1239 @staticmethod
1240 def _og_regexes(prop):
1241 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1242 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1243 % {'prop': re.escape(prop)})
1244 template = r'<meta[^>]+?%s[^>]+?%s'
1245 return [
1246 template % (property_re, content_re),
1247 template % (content_re, property_re),
1248 ]
1249
1250 @staticmethod
1251 def _meta_regex(prop):
1252 return r'''(?isx)<meta
1253 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1254 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1255
1256 def _og_search_property(self, prop, html, name=None, **kargs):
1257 prop = variadic(prop)
1258 if name is None:
1259 name = 'OpenGraph %s' % prop[0]
1260 og_regexes = []
1261 for p in prop:
1262 og_regexes.extend(self._og_regexes(p))
1263 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1264 if escaped is None:
1265 return None
1266 return unescapeHTML(escaped)
1267
1268 def _og_search_thumbnail(self, html, **kargs):
1269 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1270
1271 def _og_search_description(self, html, **kargs):
1272 return self._og_search_property('description', html, fatal=False, **kargs)
1273
1274 def _og_search_title(self, html, **kargs):
1275 return self._og_search_property('title', html, **kargs)
1276
1277 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1278 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1279 if secure:
1280 regexes = self._og_regexes('video:secure_url') + regexes
1281 return self._html_search_regex(regexes, html, name, **kargs)
1282
1283 def _og_search_url(self, html, **kargs):
1284 return self._og_search_property('url', html, **kargs)
1285
1286 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1287 name = variadic(name)
1288 if display_name is None:
1289 display_name = name[0]
1290 return self._html_search_regex(
1291 [self._meta_regex(n) for n in name],
1292 html, display_name, fatal=fatal, group='content', **kwargs)
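
    # Several candidate meta names can be tried at once; illustrative usage:
    #   uploader = self._html_search_meta(
    #       ('author', 'twitter:creator'), webpage, 'uploader', default=None)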
1293
1294 def _dc_search_uploader(self, html):
1295 return self._html_search_meta('dc.creator', html, 'uploader')
1296
1297 def _rta_search(self, html):
1298 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1299 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1300 r' content="RTA-5042-1996-1400-1577-RTA"',
1301 html):
1302 return 18
1303 return 0
1304
1305 def _media_rating_search(self, html):
1306 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1307 rating = self._html_search_meta('rating', html)
1308
1309 if not rating:
1310 return None
1311
1312 RATING_TABLE = {
1313 'safe for kids': 0,
1314 'general': 8,
1315 '14 years': 14,
1316 'mature': 17,
1317 'restricted': 19,
1318 }
1319 return RATING_TABLE.get(rating.lower())
1320
1321 def _family_friendly_search(self, html):
1322 # See http://schema.org/VideoObject
1323 family_friendly = self._html_search_meta(
1324 'isFamilyFriendly', html, default=None)
1325
1326 if not family_friendly:
1327 return None
1328
1329 RATING_TABLE = {
1330 '1': 0,
1331 'true': 0,
1332 '0': 18,
1333 'false': 18,
1334 }
1335 return RATING_TABLE.get(family_friendly.lower())
1336
1337 def _twitter_search_player(self, html):
1338 return self._html_search_meta('twitter:player', html,
1339 'twitter card player')
1340
1341 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1342 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1343 default = kwargs.get('default', NO_DEFAULT)
1344 # JSON-LD may be malformed and thus `fatal` should be respected.
1345 # At the same time `default` may be passed that assumes `fatal=False`
1346 # for _search_regex. Let's simulate the same behavior here as well.
1347 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1348 json_ld = []
1349 for mobj in json_ld_list:
1350 json_ld_item = self._parse_json(
1351 mobj.group('json_ld'), video_id, fatal=fatal)
1352 if not json_ld_item:
1353 continue
1354 if isinstance(json_ld_item, dict):
1355 json_ld.append(json_ld_item)
1356 elif isinstance(json_ld_item, (list, tuple)):
1357 json_ld.extend(json_ld_item)
1358 if json_ld:
1359 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1360 if json_ld:
1361 return json_ld
1362 if default is not NO_DEFAULT:
1363 return default
1364 elif fatal:
1365 raise RegexNotFoundError('Unable to extract JSON-LD')
1366 else:
1367 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1368 return {}
1369
1370 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1371 if isinstance(json_ld, compat_str):
1372 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1373 if not json_ld:
1374 return {}
1375 info = {}
1376 if not isinstance(json_ld, (list, tuple, dict)):
1377 return info
1378 if isinstance(json_ld, dict):
1379 json_ld = [json_ld]
1380
1381 INTERACTION_TYPE_MAP = {
1382 'CommentAction': 'comment',
1383 'AgreeAction': 'like',
1384 'DisagreeAction': 'dislike',
1385 'LikeAction': 'like',
1386 'DislikeAction': 'dislike',
1387 'ListenAction': 'view',
1388 'WatchAction': 'view',
1389 'ViewAction': 'view',
1390 }
1391
1392 def extract_interaction_type(e):
1393 interaction_type = e.get('interactionType')
1394 if isinstance(interaction_type, dict):
1395 interaction_type = interaction_type.get('@type')
1396 return str_or_none(interaction_type)
1397
1398 def extract_interaction_statistic(e):
1399 interaction_statistic = e.get('interactionStatistic')
1400 if isinstance(interaction_statistic, dict):
1401 interaction_statistic = [interaction_statistic]
1402 if not isinstance(interaction_statistic, list):
1403 return
1404 for is_e in interaction_statistic:
1405 if not isinstance(is_e, dict):
1406 continue
1407 if is_e.get('@type') != 'InteractionCounter':
1408 continue
1409 interaction_type = extract_interaction_type(is_e)
1410 if not interaction_type:
1411 continue
1412 # For interaction count some sites provide string instead of
1413 # an integer (as per spec) with non digit characters (e.g. ",")
1414 # so extracting count with more relaxed str_to_int
1415 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1416 if interaction_count is None:
1417 continue
1418 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1419 if not count_kind:
1420 continue
1421 count_key = '%s_count' % count_kind
1422 if info.get(count_key) is not None:
1423 continue
1424 info[count_key] = interaction_count
1425
1426 def extract_video_object(e):
1427 assert e['@type'] == 'VideoObject'
1428 author = e.get('author')
1429 info.update({
1430 'url': url_or_none(e.get('contentUrl')),
1431 'title': unescapeHTML(e.get('name')),
1432 'description': unescapeHTML(e.get('description')),
1433 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1434 'duration': parse_duration(e.get('duration')),
1435 'timestamp': unified_timestamp(e.get('uploadDate')),
1436 # author can be an instance of 'Organization' or 'Person' types.
1437 # both types can have 'name' property(inherited from 'Thing' type). [1]
1438 # however some websites are using 'Text' type instead.
1439 # 1. https://schema.org/VideoObject
1440 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1441 'filesize': float_or_none(e.get('contentSize')),
1442 'tbr': int_or_none(e.get('bitrate')),
1443 'width': int_or_none(e.get('width')),
1444 'height': int_or_none(e.get('height')),
1445 'view_count': int_or_none(e.get('interactionCount')),
1446 })
1447 extract_interaction_statistic(e)
1448
1449 for e in json_ld:
1450 if '@context' in e:
1451 item_type = e.get('@type')
1452 if expected_type is not None and expected_type != item_type:
1453 continue
1454 if item_type in ('TVEpisode', 'Episode'):
1455 episode_name = unescapeHTML(e.get('name'))
1456 info.update({
1457 'episode': episode_name,
1458 'episode_number': int_or_none(e.get('episodeNumber')),
1459 'description': unescapeHTML(e.get('description')),
1460 })
1461 if not info.get('title') and episode_name:
1462 info['title'] = episode_name
1463 part_of_season = e.get('partOfSeason')
1464 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1465 info.update({
1466 'season': unescapeHTML(part_of_season.get('name')),
1467 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1468 })
1469 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1470 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1471 info['series'] = unescapeHTML(part_of_series.get('name'))
1472 elif item_type == 'Movie':
1473 info.update({
1474 'title': unescapeHTML(e.get('name')),
1475 'description': unescapeHTML(e.get('description')),
1476 'duration': parse_duration(e.get('duration')),
1477 'timestamp': unified_timestamp(e.get('dateCreated')),
1478 })
1479 elif item_type in ('Article', 'NewsArticle'):
1480 info.update({
1481 'timestamp': parse_iso8601(e.get('datePublished')),
1482 'title': unescapeHTML(e.get('headline')),
1483 'description': unescapeHTML(e.get('articleBody')),
1484 })
1485 elif item_type == 'VideoObject':
1486 extract_video_object(e)
1487 if expected_type is None:
1488 continue
1489 else:
1490 break
1491 video = e.get('video')
1492 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1493 extract_video_object(video)
1494 if expected_type is None:
1495 continue
1496 else:
1497 break
1498 return dict((k, v) for k, v in info.items() if v is not None)
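# Illustrative example (not from the original source): given the JSON-LD
#   {"@context": "https://schema.org", "@type": "VideoObject",
#    "name": "Test", "duration": "PT1M30S",
#    "interactionStatistic": [{"@type": "InteractionCounter",
#        "interactionType": "https://schema.org/WatchAction",
#        "userInteractionCount": "1,000"}]}
# the parser above would yield {'title': 'Test', 'duration': 90, 'view_count': 1000}:
# 'WatchAction' maps to 'view' via INTERACTION_TYPE_MAP, and the comma in
# "1,000" is tolerated by str_to_int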
1499
1500 def _search_nextjs_data(self, webpage, video_id, **kw):
1501 return self._parse_json(
1502 self._search_regex(
1503 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1504 webpage, 'next.js data', **kw),
1505 video_id, **kw)
1506
1507 @staticmethod
1508 def _hidden_inputs(html):
1509 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1510 hidden_inputs = {}
1511 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1512 attrs = extract_attributes(input)
1513 if not attrs:
1514 continue
1515 if attrs.get('type') not in ('hidden', 'submit'):
1516 continue
1517 name = attrs.get('name') or attrs.get('id')
1518 value = attrs.get('value')
1519 if name and value is not None:
1520 hidden_inputs[name] = value
1521 return hidden_inputs
1522
1523 def _form_hidden_inputs(self, form_id, html):
1524 form = self._search_regex(
1525 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1526 html, '%s form' % form_id, group='form')
1527 return self._hidden_inputs(form)
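# Illustrative example (not from the original source): for HTML containing
#   <form id="login"><input type="hidden" name="csrf" value="abc"></form>
# _form_hidden_inputs('login', html) isolates the form body and returns
# {'csrf': 'abc'} via _hidden_inputs()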
1528
1529 class FormatSort:
1530 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
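# Illustrative examples of strings matched by the regex above:
# 'res:1080' -> field='res', separator=':', limit='1080'
#   (prefer the largest resolution not above 1080);
# 'res~720'  -> separator='~' prefers the value closest to 720;
# '+res'     -> reverse='+' prefers the smallest value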
1531
1532 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1533 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1534 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1535 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1536 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1537 'fps', 'fs_approx', 'source', 'format_id')
1538
1539 settings = {
1540 'vcodec': {'type': 'ordered', 'regex': True,
1541 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1542 'acodec': {'type': 'ordered', 'regex': True,
1543 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1544 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1545 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1546 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1547 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1548 'vext': {'type': 'ordered', 'field': 'video_ext',
1549 'order': ('mp4', 'webm', 'flv', '', 'none'),
1550 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1551 'aext': {'type': 'ordered', 'field': 'audio_ext',
1552 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1553 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1554 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1555 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1556 'field': ('vcodec', 'acodec'),
1557 'function': lambda it: int(any(v != 'none' for v in it))},
1558 'ie_pref': {'priority': True, 'type': 'extractor'},
1559 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1560 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1561 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1562 'quality': {'convert': 'float', 'default': -1},
1563 'filesize': {'convert': 'bytes'},
1564 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1565 'id': {'convert': 'string', 'field': 'format_id'},
1566 'height': {'convert': 'float_none'},
1567 'width': {'convert': 'float_none'},
1568 'fps': {'convert': 'float_none'},
1569 'tbr': {'convert': 'float_none'},
1570 'vbr': {'convert': 'float_none'},
1571 'abr': {'convert': 'float_none'},
1572 'asr': {'convert': 'float_none'},
1573 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1574
1575 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1576 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1577 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1578 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1579 'res': {'type': 'multiple', 'field': ('height', 'width'),
1580 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1581
1582 # Most of these exist only for compatibility reasons
1583 'dimension': {'type': 'alias', 'field': 'res'},
1584 'resolution': {'type': 'alias', 'field': 'res'},
1585 'extension': {'type': 'alias', 'field': 'ext'},
1586 'bitrate': {'type': 'alias', 'field': 'br'},
1587 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1588 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1589 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1590 'framerate': {'type': 'alias', 'field': 'fps'},
1591 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1592 'protocol': {'type': 'alias', 'field': 'proto'},
1593 'source_preference': {'type': 'alias', 'field': 'source'},
1594 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1595 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1596 'samplerate': {'type': 'alias', 'field': 'asr'},
1597 'video_ext': {'type': 'alias', 'field': 'vext'},
1598 'audio_ext': {'type': 'alias', 'field': 'aext'},
1599 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1600 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1601 'video': {'type': 'alias', 'field': 'hasvid'},
1602 'has_video': {'type': 'alias', 'field': 'hasvid'},
1603 'audio': {'type': 'alias', 'field': 'hasaud'},
1604 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1605 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1606 'preference': {'type': 'alias', 'field': 'ie_pref'},
1607 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1608 'format_id': {'type': 'alias', 'field': 'id'},
1609 }
1610
1611 _order = []
1612
1613 def _get_field_setting(self, field, key):
1614 if field not in self.settings:
1615 self.settings[field] = {}
1616 propObj = self.settings[field]
1617 if key not in propObj:
1618 type = propObj.get('type')
1619 if key == 'field':
1620 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1621 elif key == 'convert':
1622 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1623 else:
1624 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1625 propObj[key] = default
1626 return propObj[key]
1627
1628 def _resolve_field_value(self, field, value, convertNone=False):
1629 if value is None:
1630 if not convertNone:
1631 return None
1632 else:
1633 value = value.lower()
1634 conversion = self._get_field_setting(field, 'convert')
1635 if conversion == 'ignore':
1636 return None
1637 if conversion == 'string':
1638 return value
1639 elif conversion == 'float_none':
1640 return float_or_none(value)
1641 elif conversion == 'bytes':
1642 return FileDownloader.parse_bytes(value)
1643 elif conversion == 'order':
1644 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1645 use_regex = self._get_field_setting(field, 'regex')
1646 list_length = len(order_list)
1647 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1648 if use_regex and value is not None:
1649 for i, regex in enumerate(order_list):
1650 if regex and re.match(regex, value):
1651 return list_length - i
1652 return list_length - empty_pos # not in list
1653 else: # not regex or value is None
1654 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1655 else:
1656 if value.isnumeric():
1657 return float(value)
1658 else:
1659 self.settings[field]['convert'] = 'string'
1660 return value
1661
1662 def evaluate_params(self, params, sort_extractor):
1663 self._use_free_order = params.get('prefer_free_formats', False)
1664 self._sort_user = params.get('format_sort', [])
1665 self._sort_extractor = sort_extractor
1666
1667 def add_item(field, reverse, closest, limit_text):
1668 field = field.lower()
1669 if field in self._order:
1670 return
1671 self._order.append(field)
1672 limit = self._resolve_field_value(field, limit_text)
1673 data = {
1674 'reverse': reverse,
1675 'closest': False if limit is None else closest,
1676 'limit_text': limit_text,
1677 'limit': limit}
1678 if field in self.settings:
1679 self.settings[field].update(data)
1680 else:
1681 self.settings[field] = data
1682
1683 sort_list = (
1684 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1685 + (tuple() if params.get('format_sort_force', False)
1686 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1687 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1688
1689 for item in sort_list:
1690 match = re.match(self.regex, item)
1691 if match is None:
1692 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1693 field = match.group('field')
1694 if field is None:
1695 continue
1696 if self._get_field_setting(field, 'type') == 'alias':
1697 field = self._get_field_setting(field, 'field')
1698 reverse = match.group('reverse') is not None
1699 closest = match.group('separator') == '~'
1700 limit_text = match.group('limit')
1701
1702 has_limit = limit_text is not None
1703 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1704 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1705
1706 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1707 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1708 limit_count = len(limits)
1709 for (i, f) in enumerate(fields):
1710 add_item(f, reverse, closest,
1711 limits[i] if i < limit_count
1712 else limits[0] if has_limit and not has_multiple_limits
1713 else None)
1714
1715 def print_verbose_info(self, write_debug):
1716 if self._sort_user:
1717 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1718 if self._sort_extractor:
1719 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1720 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1721 '+' if self._get_field_setting(field, 'reverse') else '', field,
1722 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1723 self._get_field_setting(field, 'limit_text'),
1724 self._get_field_setting(field, 'limit'))
1725 if self._get_field_setting(field, 'limit_text') is not None else '')
1726 for field in self._order if self._get_field_setting(field, 'visible')]))
1727
1728 def _calculate_field_preference_from_value(self, format, field, type, value):
1729 reverse = self._get_field_setting(field, 'reverse')
1730 closest = self._get_field_setting(field, 'closest')
1731 limit = self._get_field_setting(field, 'limit')
1732
1733 if type == 'extractor':
1734 maximum = self._get_field_setting(field, 'max')
1735 if value is None or (maximum is not None and value >= maximum):
1736 value = -1
1737 elif type == 'boolean':
1738 in_list = self._get_field_setting(field, 'in_list')
1739 not_in_list = self._get_field_setting(field, 'not_in_list')
1740 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1741 elif type == 'ordered':
1742 value = self._resolve_field_value(field, value, True)
1743
1744 # try to convert to number
1745 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1746 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1747 if is_num:
1748 value = val_num
1749
1750 return ((-10, 0) if value is None
1751 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1752 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1753 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1754 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1755 else (-1, value, 0))
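# Illustrative example: with 'res:720' (limit 720, not reversed, not
# closest), a 720p format yields (0, 720.0, 0) while a 1080p one yields
# (0, -1080.0, 0); larger tuples rank better, so the 720p format wins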
1756
1757 def _calculate_field_preference(self, format, field):
1758 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1759 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1760 if type == 'multiple':
1761 type = 'field' # Only 'field' is allowed in multiple for now
1762 actual_fields = self._get_field_setting(field, 'field')
1763
1764 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1765 else:
1766 value = get_value(field)
1767 return self._calculate_field_preference_from_value(format, field, type, value)
1768
1769 def calculate_preference(self, format):
1770 # Determine missing protocol
1771 if not format.get('protocol'):
1772 format['protocol'] = determine_protocol(format)
1773
1774 # Determine missing ext
1775 if not format.get('ext') and 'url' in format:
1776 format['ext'] = determine_ext(format['url'])
1777 if format.get('vcodec') == 'none':
1778 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1779 format['video_ext'] = 'none'
1780 else:
1781 format['video_ext'] = format['ext']
1782 format['audio_ext'] = 'none'
1783 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1784 # format['preference'] = -1000
1785
1786 # Determine missing bitrates
1787 if format.get('tbr') is None:
1788 if format.get('vbr') is not None and format.get('abr') is not None:
1789 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1790 else:
1791 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1792 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1793 if format.get('acodec') != 'none' and format.get('abr') is None:
1794 format['abr'] = format.get('tbr') - format.get('vbr', 0)
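# Illustrative example: vbr=2500 and abr=128 give tbr=2628; conversely,
# tbr=2628 with abr=128 and a missing vbr gives vbr=2500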
1795
1796 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1797
1798 def _sort_formats(self, formats, field_preference=[]):
1799 if not formats:
1800 return
1801 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1802 format_sort.evaluate_params(self._downloader.params, field_preference)
1803 if self.get_param('verbose', False):
1804 format_sort.print_verbose_info(self._downloader.write_debug)
1805 formats.sort(key=lambda f: format_sort.calculate_preference(f))
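# Illustrative usage from an extractor, after collecting formats:
#   self._sort_formats(formats)
# or, giving certain fields priority over the defaults:
#   self._sort_formats(formats, ('res', 'br'))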
1806
1807 def _check_formats(self, formats, video_id):
1808 if formats:
1809 formats[:] = filter(
1810 lambda f: self._is_valid_url(
1811 f['url'], video_id,
1812 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1813 formats)
1814
1815 @staticmethod
1816 def _remove_duplicate_formats(formats):
1817 format_urls = set()
1818 unique_formats = []
1819 for f in formats:
1820 if f['url'] not in format_urls:
1821 format_urls.add(f['url'])
1822 unique_formats.append(f)
1823 formats[:] = unique_formats
1824
1825 def _is_valid_url(self, url, video_id, item='video', headers={}):
1826 url = self._proto_relative_url(url, scheme='http:')
1827 # For now, assume non-HTTP(S) URLs are always valid
1828 if not (url.startswith('http://') or url.startswith('https://')):
1829 return True
1830 try:
1831 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1832 return True
1833 except ExtractorError as e:
1834 self.to_screen(
1835 '%s: %s URL is invalid, skipping: %s'
1836 % (video_id, item, error_to_compat_str(e.cause)))
1837 return False
1838
1839 def http_scheme(self):
1840 """ Either "http:" or "https:", depending on the user's preferences """
1841 return (
1842 'http:'
1843 if self.get_param('prefer_insecure', False)
1844 else 'https:')
1845
1846 def _proto_relative_url(self, url, scheme=None):
1847 if url is None:
1848 return url
1849 if url.startswith('//'):
1850 if scheme is None:
1851 scheme = self.http_scheme()
1852 return scheme + url
1853 else:
1854 return url
1855
1856 def _sleep(self, timeout, video_id, msg_template=None):
1857 if msg_template is None:
1858 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1859 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1860 self.to_screen(msg)
1861 time.sleep(timeout)
1862
1863 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1864 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1865 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1866 manifest = self._download_xml(
1867 manifest_url, video_id, 'Downloading f4m manifest',
1868 'Unable to download f4m manifest',
1869 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1870 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1871 transform_source=transform_source,
1872 fatal=fatal, data=data, headers=headers, query=query)
1873
1874 if manifest is False:
1875 return []
1876
1877 return self._parse_f4m_formats(
1878 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1879 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1880
1881 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1882 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1883 fatal=True, m3u8_id=None):
1884 if not isinstance(manifest, compat_etree_Element) and not fatal:
1885 return []
1886
1887 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1888 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1889 if akamai_pv is not None and ';' in akamai_pv.text:
1890 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1891 if playerVerificationChallenge.strip() != '':
1892 return []
1893
1894 formats = []
1895 manifest_version = '1.0'
1896 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1897 if not media_nodes:
1898 manifest_version = '2.0'
1899 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1900 # Remove unsupported DRM-protected media from the final format
1901 # renditions (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1902 media_nodes = remove_encrypted_media(media_nodes)
1903 if not media_nodes:
1904 return formats
1905
1906 manifest_base_url = get_base_url(manifest)
1907
1908 bootstrap_info = xpath_element(
1909 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1910 'bootstrap info', default=None)
1911
1912 vcodec = None
1913 mime_type = xpath_text(
1914 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1915 'mime type', default=None)
1916 if mime_type and mime_type.startswith('audio/'):
1917 vcodec = 'none'
1918
1919 for i, media_el in enumerate(media_nodes):
1920 tbr = int_or_none(media_el.attrib.get('bitrate'))
1921 width = int_or_none(media_el.attrib.get('width'))
1922 height = int_or_none(media_el.attrib.get('height'))
1923 format_id = join_nonempty(f4m_id, tbr or i)
1924 # If <bootstrapInfo> is present, the specified f4m is a
1925 # stream-level manifest, and only set-level manifests may refer to
1926 # external resources. See section 11.4 and section 4 of F4M spec
1927 if bootstrap_info is None:
1928 media_url = None
1929 # @href is introduced in 2.0, see section 11.6 of F4M spec
1930 if manifest_version == '2.0':
1931 media_url = media_el.attrib.get('href')
1932 if media_url is None:
1933 media_url = media_el.attrib.get('url')
1934 if not media_url:
1935 continue
1936 manifest_url = (
1937 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1938 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1939 # If media_url is itself an f4m manifest, do the recursive extraction,
1940 # since bitrates in the parent manifest (this one) and in the media_url
1941 # manifest may differ, making it impossible to resolve the format by
1942 # the requested bitrate in the f4m downloader
1943 ext = determine_ext(manifest_url)
1944 if ext == 'f4m':
1945 f4m_formats = self._extract_f4m_formats(
1946 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1947 transform_source=transform_source, fatal=fatal)
1948 # Sometimes a stream-level manifest contains a single media entry that
1949 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player),
1950 # while the parent's media entry in the set-level manifest may
1951 # contain it. We will copy it from the parent in such cases.
1952 if len(f4m_formats) == 1:
1953 f = f4m_formats[0]
1954 f.update({
1955 'tbr': f.get('tbr') or tbr,
1956 'width': f.get('width') or width,
1957 'height': f.get('height') or height,
1958 'format_id': f.get('format_id') if not tbr else format_id,
1959 'vcodec': vcodec,
1960 })
1961 formats.extend(f4m_formats)
1962 continue
1963 elif ext == 'm3u8':
1964 formats.extend(self._extract_m3u8_formats(
1965 manifest_url, video_id, 'mp4', preference=preference,
1966 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1967 continue
1968 formats.append({
1969 'format_id': format_id,
1970 'url': manifest_url,
1971 'manifest_url': manifest_url,
1972 'ext': 'flv' if bootstrap_info is not None else None,
1973 'protocol': 'f4m',
1974 'tbr': tbr,
1975 'width': width,
1976 'height': height,
1977 'vcodec': vcodec,
1978 'preference': preference,
1979 'quality': quality,
1980 })
1981 return formats
1982
1983 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1984 return {
1985 'format_id': join_nonempty(m3u8_id, 'meta'),
1986 'url': m3u8_url,
1987 'ext': ext,
1988 'protocol': 'm3u8',
1989 'preference': preference - 100 if preference else -100,
1990 'quality': quality,
1991 'resolution': 'multiple',
1992 'format_note': 'Quality selection URL',
1993 }
1994
1995 def _report_ignoring_subs(self, name):
1996 self.report_warning(bug_reports_message(
1997 f'Ignoring subtitle tracks found in the {name} manifest; '
1998 'if any subtitle tracks are missing,'
1999 ), only_once=True)
2000
2001 def _extract_m3u8_formats(self, *args, **kwargs):
2002 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2003 if subs:
2004 self._report_ignoring_subs('HLS')
2005 return fmts
2006
2007 def _extract_m3u8_formats_and_subtitles(
2008 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2009 preference=None, quality=None, m3u8_id=None, note=None,
2010 errnote=None, fatal=True, live=False, data=None, headers={},
2011 query={}):
2012
2013 res = self._download_webpage_handle(
2014 m3u8_url, video_id,
2015 note='Downloading m3u8 information' if note is None else note,
2016 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2017 fatal=fatal, data=data, headers=headers, query=query)
2018
2019 if res is False:
2020 return [], {}
2021
2022 m3u8_doc, urlh = res
2023 m3u8_url = urlh.geturl()
2024
2025 return self._parse_m3u8_formats_and_subtitles(
2026 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2027 preference=preference, quality=quality, m3u8_id=m3u8_id,
2028 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2029 headers=headers, query=query, video_id=video_id)
2030
2031 def _parse_m3u8_formats_and_subtitles(
2032 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2033 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2034 errnote=None, fatal=True, data=None, headers={}, query={},
2035 video_id=None):
2036 formats, subtitles = [], {}
2037
2038 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
2039 return formats, subtitles
2040
2041 has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
2042
2043 def format_url(url):
2044 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2045
2046 if self.get_param('hls_split_discontinuity', False):
2047 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2048 if not m3u8_doc:
2049 if not manifest_url:
2050 return []
2051 m3u8_doc = self._download_webpage(
2052 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2053 note=False, errnote='Failed to download m3u8 playlist information')
2054 if m3u8_doc is False:
2055 return []
2056 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2057
2058 else:
2059 def _extract_m3u8_playlist_indices(*args, **kwargs):
2060 return [None]
2061
2062 # References:
2063 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2064 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2065 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2066
2067 # We should try extracting formats only from master playlists [1, 4.3.4],
2068 # i.e. playlists that describe the available qualities. On the other hand,
2069 # media playlists [1, 4.3.3] should be returned as is, since they contain
2070 # just the media without quality renditions.
2071 # Fortunately, a master playlist can easily be distinguished from a media
2072 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2073 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2074 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2075 # media playlist and MUST NOT appear in a master playlist, so we can
2076 # reliably detect a media playlist with this criterion.
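# Illustrative example: a media playlist looks like
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# whereas a master playlist instead carries #EXT-X-STREAM-INF entries
# pointing at such media playlists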
2077
2078 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2079 formats = [{
2080 'format_id': join_nonempty(m3u8_id, idx),
2081 'format_index': idx,
2082 'url': m3u8_url,
2083 'ext': ext,
2084 'protocol': entry_protocol,
2085 'preference': preference,
2086 'quality': quality,
2087 'has_drm': has_drm,
2088 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2089
2090 return formats, subtitles
2091
2092 groups = {}
2093 last_stream_inf = {}
2094
2095 def extract_media(x_media_line):
2096 media = parse_m3u8_attributes(x_media_line)
2097 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2098 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2099 if not (media_type and group_id and name):
2100 return
2101 groups.setdefault(group_id, []).append(media)
2102 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2103 if media_type == 'SUBTITLES':
2104 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2105 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2106 # However, lack of URI has been spotted in the wild.
2107 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2108 if not media.get('URI'):
2109 return
2110 url = format_url(media['URI'])
2111 sub_info = {
2112 'url': url,
2113 'ext': determine_ext(url),
2114 }
2115 if sub_info['ext'] == 'm3u8':
2116 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2117 # files may contain is WebVTT:
2118 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2119 sub_info['ext'] = 'vtt'
2120 sub_info['protocol'] = 'm3u8_native'
2121 lang = media.get('LANGUAGE') or 'und'
2122 subtitles.setdefault(lang, []).append(sub_info)
2123 if media_type not in ('VIDEO', 'AUDIO'):
2124 return
2125 media_url = media.get('URI')
2126 if media_url:
2127 manifest_url = format_url(media_url)
2128 formats.extend({
2129 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2130 'format_note': name,
2131 'format_index': idx,
2132 'url': manifest_url,
2133 'manifest_url': m3u8_url,
2134 'language': media.get('LANGUAGE'),
2135 'ext': ext,
2136 'protocol': entry_protocol,
2137 'preference': preference,
2138 'quality': quality,
2139 'vcodec': 'none' if media_type == 'AUDIO' else None,
2140 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2141
2142 def build_stream_name():
2143 # Although the specification does not mention the NAME attribute for
2144 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2145 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2146 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2147 stream_name = last_stream_inf.get('NAME')
2148 if stream_name:
2149 return stream_name
2150 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2151 # from the corresponding rendition group
2152 stream_group_id = last_stream_inf.get('VIDEO')
2153 if not stream_group_id:
2154 return
2155 stream_group = groups.get(stream_group_id)
2156 if not stream_group:
2157 return stream_group_id
2158 rendition = stream_group[0]
2159 return rendition.get('NAME') or stream_group_id
2160
2161 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2162 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2163 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2164 for line in m3u8_doc.splitlines():
2165 if line.startswith('#EXT-X-MEDIA:'):
2166 extract_media(line)
2167
2168 for line in m3u8_doc.splitlines():
2169 if line.startswith('#EXT-X-STREAM-INF:'):
2170 last_stream_inf = parse_m3u8_attributes(line)
2171 elif line.startswith('#') or not line.strip():
2172 continue
2173 else:
2174 tbr = float_or_none(
2175 last_stream_inf.get('AVERAGE-BANDWIDTH')
2176 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2177 manifest_url = format_url(line.strip())
2178
2179 for idx in _extract_m3u8_playlist_indices(manifest_url):
2180 format_id = [m3u8_id, None, idx]
2181 # The bandwidth of live streams may differ over time, making
2182 # format_id unpredictable, so it's better to keep the provided
2183 # format_id intact.
2184 if not live:
2185 stream_name = build_stream_name()
2186 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2187 f = {
2188 'format_id': join_nonempty(*format_id),
2189 'format_index': idx,
2190 'url': manifest_url,
2191 'manifest_url': m3u8_url,
2192 'tbr': tbr,
2193 'ext': ext,
2194 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2195 'protocol': entry_protocol,
2196 'preference': preference,
2197 'quality': quality,
2198 }
2199 resolution = last_stream_inf.get('RESOLUTION')
2200 if resolution:
2201 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2202 if mobj:
2203 f['width'] = int(mobj.group('width'))
2204 f['height'] = int(mobj.group('height'))
2205 # Unified Streaming Platform
2206 mobj = re.search(
2207 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2208 if mobj:
2209 abr, vbr = mobj.groups()
2210 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2211 f.update({
2212 'vbr': vbr,
2213 'abr': abr,
2214 })
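# Illustrative example: a URL containing 'audio=128000-video=2800000'
# yields abr=128.0 and vbr=2800.0 (bits/s scaled to kbit/s)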
2215 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2216 f.update(codecs)
2217 audio_group_id = last_stream_inf.get('AUDIO')
2218 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2219 # references a rendition group MUST have a CODECS attribute.
2220 # However, this is not always respected, for example, [2]
2221 # contains EXT-X-STREAM-INF tag which references AUDIO
2222 # rendition group but does not have CODECS and despite
2223 # referencing an audio group it represents a complete
2224 # (with audio and video) format. So, for such cases we will
2225 # ignore references to rendition groups and treat them
2226 # as complete formats.
2227 if audio_group_id and codecs and f.get('vcodec') != 'none':
2228 audio_group = groups.get(audio_group_id)
2229 if audio_group and audio_group[0].get('URI'):
2230 # TODO: update acodec for audio only formats with
2231 # the same GROUP-ID
2232 f['acodec'] = 'none'
2233 if not f.get('ext'):
2234 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2235 formats.append(f)
2236
2237 # for DailyMotion
2238 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2239 if progressive_uri:
2240 http_f = f.copy()
2241 del http_f['manifest_url']
2242 http_f.update({
2243 'format_id': f['format_id'].replace('hls-', 'http-'),
2244 'protocol': 'http',
2245 'url': progressive_uri,
2246 })
2247 formats.append(http_f)
2248
2249 last_stream_inf = {}
2250 return formats, subtitles
2251
2252 def _extract_m3u8_vod_duration(
2253 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2254
2255 m3u8_vod = self._download_webpage(
2256 m3u8_vod_url, video_id,
2257 note='Downloading m3u8 VOD manifest' if note is None else note,
2258 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2259 fatal=False, data=data, headers=headers, query=query)
2260
2261 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2262
2263 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2264 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2265 return None
2266
2267 return int(sum(
2268 float(line[len('#EXTINF:'):].split(',')[0])
2269 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
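# Illustrative example: a playlist containing
#   #EXT-X-PLAYLIST-TYPE:VOD
#   #EXTINF:10.0,
#   #EXTINF:9.5,
# yields int(10.0 + 9.5) = 19 seconds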
2270
2271 @staticmethod
2272 def _xpath_ns(path, namespace=None):
2273 if not namespace:
2274 return path
2275 out = []
2276 for c in path.split('/'):
2277 if not c or c == '.':
2278 out.append(c)
2279 else:
2280 out.append('{%s}%s' % (namespace, c))
2281 return '/'.join(out)
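# Illustrative example:
#   _xpath_ns('./head/meta', 'http://www.w3.org/2001/SMIL20/Language')
# returns './{http://www.w3.org/2001/SMIL20/Language}head/{http://www.w3.org/2001/SMIL20/Language}meta'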
2282
2283 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2284 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2285
2286 if smil is False:
2287 assert not fatal
2288 return []
2289
2290 namespace = self._parse_smil_namespace(smil)
2291
2292 fmts = self._parse_smil_formats(
2293 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2294 subs = self._parse_smil_subtitles(
2295 smil, namespace=namespace)
2296
2297 return fmts, subs
2298
2299 def _extract_smil_formats(self, *args, **kwargs):
2300 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2301 if subs:
2302 self._report_ignoring_subs('SMIL')
2303 return fmts
2304
2305 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2306 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2307 if smil is False:
2308 return {}
2309 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2310
2311 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2312 return self._download_xml(
2313 smil_url, video_id, 'Downloading SMIL file',
2314 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2315
2316 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2317 namespace = self._parse_smil_namespace(smil)
2318
2319 formats = self._parse_smil_formats(
2320 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2321 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2322
2323 video_id = os.path.splitext(url_basename(smil_url))[0]
2324 title = None
2325 description = None
2326 upload_date = None
2327 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2328 name = meta.attrib.get('name')
2329 content = meta.attrib.get('content')
2330 if not name or not content:
2331 continue
2332 if not title and name == 'title':
2333 title = content
2334 elif not description and name in ('description', 'abstract'):
2335 description = content
2336 elif not upload_date and name == 'date':
2337 upload_date = unified_strdate(content)
2338
2339 thumbnails = [{
2340 'id': image.get('type'),
2341 'url': image.get('src'),
2342 'width': int_or_none(image.get('width')),
2343 'height': int_or_none(image.get('height')),
2344 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2345
2346 return {
2347 'id': video_id,
2348 'title': title or video_id,
2349 'description': description,
2350 'upload_date': upload_date,
2351 'thumbnails': thumbnails,
2352 'formats': formats,
2353 'subtitles': subtitles,
2354 }
2355
2356 def _parse_smil_namespace(self, smil):
2357 return self._search_regex(
2358 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2359
2360 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2361 base = smil_url
2362 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2363 b = meta.get('base') or meta.get('httpBase')
2364 if b:
2365 base = b
2366 break
2367
2368 formats = []
2369 rtmp_count = 0
2370 http_count = 0
2371 m3u8_count = 0
2372 imgs_count = 0
2373
2374 srcs = set()
2375 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2376 for medium in media:
2377 src = medium.get('src')
2378 if not src or src in srcs:
2379 continue
2380 srcs.add(src)
2381
2382 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2383 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2384 width = int_or_none(medium.get('width'))
2385 height = int_or_none(medium.get('height'))
2386 proto = medium.get('proto')
2387 ext = medium.get('ext')
2388 src_ext = determine_ext(src)
2389 streamer = medium.get('streamer') or base
2390
2391 if proto == 'rtmp' or streamer.startswith('rtmp'):
2392 rtmp_count += 1
2393 formats.append({
2394 'url': streamer,
2395 'play_path': src,
2396 'ext': 'flv',
2397 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2398 'tbr': bitrate,
2399 'filesize': filesize,
2400 'width': width,
2401 'height': height,
2402 })
2403 if transform_rtmp_url:
2404 streamer, src = transform_rtmp_url(streamer, src)
2405 formats[-1].update({
2406 'url': streamer,
2407 'play_path': src,
2408 })
2409 continue
2410
2411 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2412 src_url = src_url.strip()
2413
2414 if proto == 'm3u8' or src_ext == 'm3u8':
2415 m3u8_formats = self._extract_m3u8_formats(
2416 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2417 if len(m3u8_formats) == 1:
2418 m3u8_count += 1
2419 m3u8_formats[0].update({
2420 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2421 'tbr': bitrate,
2422 'width': width,
2423 'height': height,
2424 })
2425 formats.extend(m3u8_formats)
2426 elif src_ext == 'f4m':
2427 f4m_url = src_url
2428 if not f4m_params:
2429 f4m_params = {
2430 'hdcore': '3.2.0',
2431 'plugin': 'flowplayer-3.2.0.1',
2432 }
2433 f4m_url += '&' if '?' in f4m_url else '?'
2434 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2435 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2436 elif src_ext == 'mpd':
2437 formats.extend(self._extract_mpd_formats(
2438 src_url, video_id, mpd_id='dash', fatal=False))
2439 elif re.search(r'\.ism/[Mm]anifest', src_url):
2440 formats.extend(self._extract_ism_formats(
2441 src_url, video_id, ism_id='mss', fatal=False))
2442 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2443 http_count += 1
2444 formats.append({
2445 'url': src_url,
2446 'ext': ext or src_ext or 'flv',
2447 'format_id': 'http-%d' % (bitrate or http_count),
2448 'tbr': bitrate,
2449 'filesize': filesize,
2450 'width': width,
2451 'height': height,
2452 })
2453
2454 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2455 src = medium.get('src')
2456 if not src or src in srcs:
2457 continue
2458 srcs.add(src)
2459
2460 imgs_count += 1
2461 formats.append({
2462 'format_id': 'imagestream-%d' % (imgs_count),
2463 'url': src,
2464 'ext': mimetype2ext(medium.get('type')),
2465 'acodec': 'none',
2466 'vcodec': 'none',
2467 'width': int_or_none(medium.get('width')),
2468 'height': int_or_none(medium.get('height')),
2469 'format_note': 'SMIL storyboards',
2470 })
2471
2472 return formats
2473
2474 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2475 urls = []
2476 subtitles = {}
2477 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2478 src = textstream.get('src')
2479 if not src or src in urls:
2480 continue
2481 urls.append(src)
2482 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2483 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2484 subtitles.setdefault(lang, []).append({
2485 'url': src,
2486 'ext': ext,
2487 })
2488 return subtitles
2489
2490 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2491 xspf = self._download_xml(
2492 xspf_url, playlist_id, 'Downloading xspf playlist',
2493 'Unable to download xspf manifest', fatal=fatal)
2494 if xspf is False:
2495 return []
2496 return self._parse_xspf(
2497 xspf, playlist_id, xspf_url=xspf_url,
2498 xspf_base_url=base_url(xspf_url))
2499
2500 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2501 NS_MAP = {
2502 'xspf': 'http://xspf.org/ns/0/',
2503 's1': 'http://static.streamone.nl/player/ns/0',
2504 }
2505
2506 entries = []
2507 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2508 title = xpath_text(
2509 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2510 description = xpath_text(
2511 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2512 thumbnail = xpath_text(
2513 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2514 duration = float_or_none(
2515 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2516
2517 formats = []
2518 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2519 format_url = urljoin(xspf_base_url, location.text)
2520 if not format_url:
2521 continue
2522 formats.append({
2523 'url': format_url,
2524 'manifest_url': xspf_url,
2525 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2526 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2527 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2528 })
2529 self._sort_formats(formats)
2530
2531 entries.append({
2532 'id': playlist_id,
2533 'title': title,
2534 'description': description,
2535 'thumbnail': thumbnail,
2536 'duration': duration,
2537 'formats': formats,
2538 })
2539 return entries
2540
2541 def _extract_mpd_formats(self, *args, **kwargs):
2542 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2543 if subs:
2544 self._report_ignoring_subs('DASH')
2545 return fmts
2546
2547 def _extract_mpd_formats_and_subtitles(
2548 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2549 fatal=True, data=None, headers={}, query={}):
2550 res = self._download_xml_handle(
2551 mpd_url, video_id,
2552 note='Downloading MPD manifest' if note is None else note,
2553 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2554 fatal=fatal, data=data, headers=headers, query=query)
2555 if res is False:
2556 return [], {}
2557 mpd_doc, urlh = res
2558 if mpd_doc is None:
2559 return [], {}
2560 mpd_base_url = base_url(urlh.geturl())
2561
2562 return self._parse_mpd_formats_and_subtitles(
2563 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2564
2565 def _parse_mpd_formats(self, *args, **kwargs):
2566 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2567 if subs:
2568 self._report_ignoring_subs('DASH')
2569 return fmts
2570
2571 def _parse_mpd_formats_and_subtitles(
2572 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2573 """
2574 Parse formats from MPD manifest.
2575 References:
2576 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2577 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2578 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2579 """
2580 if not self.get_param('dynamic_mpd', True):
2581 if mpd_doc.get('type') == 'dynamic':
2582 return [], {}
2583
2584 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2585
2586 def _add_ns(path):
2587 return self._xpath_ns(path, namespace)
2588
2589 def is_drm_protected(element):
2590 return element.find(_add_ns('ContentProtection')) is not None
2591
2592 def extract_multisegment_info(element, ms_parent_info):
2593 ms_info = ms_parent_info.copy()
2594
2595 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2596 # common attributes and elements; we will only extract the ones
2597 # relevant for us.
2598 def extract_common(source):
2599 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2600 if segment_timeline is not None:
2601 s_e = segment_timeline.findall(_add_ns('S'))
2602 if s_e:
2603 ms_info['total_number'] = 0
2604 ms_info['s'] = []
2605 for s in s_e:
2606 r = int(s.get('r', 0))
2607 ms_info['total_number'] += 1 + r
2608 ms_info['s'].append({
2609 't': int(s.get('t', 0)),
2610 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2611 'd': int(s.attrib['d']),
2612 'r': r,
2613 })
2614 start_number = source.get('startNumber')
2615 if start_number:
2616 ms_info['start_number'] = int(start_number)
2617 timescale = source.get('timescale')
2618 if timescale:
2619 ms_info['timescale'] = int(timescale)
2620 segment_duration = source.get('duration')
2621 if segment_duration:
2622 ms_info['segment_duration'] = float(segment_duration)
2623
2624 def extract_Initialization(source):
2625 initialization = source.find(_add_ns('Initialization'))
2626 if initialization is not None:
2627 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2628
2629 segment_list = element.find(_add_ns('SegmentList'))
2630 if segment_list is not None:
2631 extract_common(segment_list)
2632 extract_Initialization(segment_list)
2633 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2634 if segment_urls_e:
2635 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2636 else:
2637 segment_template = element.find(_add_ns('SegmentTemplate'))
2638 if segment_template is not None:
2639 extract_common(segment_template)
2640 media = segment_template.get('media')
2641 if media:
2642 ms_info['media'] = media
2643 initialization = segment_template.get('initialization')
2644 if initialization:
2645 ms_info['initialization'] = initialization
2646 else:
2647 extract_Initialization(segment_template)
2648 return ms_info
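# Illustrative example: a SegmentTimeline of <S t="0" d="4000" r="2"/>
# with timescale="1000" yields total_number=3 and
# s=[{'t': 0, 'd': 4000, 'r': 2}], i.e. three 4-second segments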
2649
2650 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2651 formats, subtitles = [], {}
2652 stream_numbers = {'audio': 0, 'video': 0}
2653 for period in mpd_doc.findall(_add_ns('Period')):
2654 period_duration = parse_duration(period.get('duration')) or mpd_duration
2655 period_ms_info = extract_multisegment_info(period, {
2656 'start_number': 1,
2657 'timescale': 1,
2658 })
2659 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2660 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2661 for representation in adaptation_set.findall(_add_ns('Representation')):
2662 representation_attrib = adaptation_set.attrib.copy()
2663 representation_attrib.update(representation.attrib)
2664 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2665 mime_type = representation_attrib['mimeType']
2666 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2667
2668 codecs = representation_attrib.get('codecs', '')
2669 if content_type not in ('video', 'audio', 'text'):
2670 if mime_type == 'image/jpeg':
2671 content_type = mime_type
2672 elif codecs.split('.')[0] == 'stpp':
2673 content_type = 'text'
2674 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2675 content_type = 'text'
2676 else:
2677 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2678 continue
2679
2680 base_url = ''
2681 for element in (representation, adaptation_set, period, mpd_doc):
2682 base_url_e = element.find(_add_ns('BaseURL'))
2683 if base_url_e is not None:
2684 base_url = base_url_e.text + base_url
2685 if re.match(r'^https?://', base_url):
2686 break
2687 if mpd_base_url and base_url.startswith('/'):
2688 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2689 elif mpd_base_url and not re.match(r'^https?://', base_url):
2690 if not mpd_base_url.endswith('/'):
2691 mpd_base_url += '/'
2692 base_url = mpd_base_url + base_url
2693 representation_id = representation_attrib.get('id')
2694 lang = representation_attrib.get('lang')
2695 url_el = representation.find(_add_ns('BaseURL'))
2696 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2697 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2698 if representation_id is not None:
2699 format_id = representation_id
2700 else:
2701 format_id = content_type
2702 if mpd_id:
2703 format_id = mpd_id + '-' + format_id
2704 if content_type in ('video', 'audio'):
2705 f = {
2706 'format_id': format_id,
2707 'manifest_url': mpd_url,
2708 'ext': mimetype2ext(mime_type),
2709 'width': int_or_none(representation_attrib.get('width')),
2710 'height': int_or_none(representation_attrib.get('height')),
2711 'tbr': float_or_none(bandwidth, 1000),
2712 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2713 'fps': int_or_none(representation_attrib.get('frameRate')),
2714 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2715 'format_note': 'DASH %s' % content_type,
2716 'filesize': filesize,
2717 'container': mimetype2ext(mime_type) + '_dash',
2718 'manifest_stream_number': stream_numbers[content_type]
2719 }
2720 f.update(parse_codecs(codecs))
2721 stream_numbers[content_type] += 1
2722 elif content_type == 'text':
2723 f = {
2724 'ext': mimetype2ext(mime_type),
2725 'manifest_url': mpd_url,
2726 'filesize': filesize,
2727 }
2728 elif content_type == 'image/jpeg':
2729 # See test case in VikiIE
2730 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2731 f = {
2732 'format_id': format_id,
2733 'ext': 'mhtml',
2734 'manifest_url': mpd_url,
2735 'format_note': 'DASH storyboards (jpeg)',
2736 'acodec': 'none',
2737 'vcodec': 'none',
2738 }
2739 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2740 f['has_drm'] = True
2741 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2742
2743 def prepare_template(template_name, identifiers):
2744 tmpl = representation_ms_info[template_name]
2745 # First off, % characters outside $...$ templates
2746 # must be escaped by doubling for proper processing
2747 # by the % operator string formatting used further on (see
2748 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2749 t = ''
2750 in_template = False
2751 for c in tmpl:
2752 t += c
2753 if c == '$':
2754 in_template = not in_template
2755 elif c == '%' and not in_template:
2756 t += c
2757 # Next, $...$ templates are translated to their
2758 # %(...) counterparts to be used with % operator
2759 if representation_id is not None:
2760 t = t.replace('$RepresentationID$', representation_id)
2761 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2762 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2763 t = t.replace('$$', '$')
2764 return t
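# Illustrative example: with representation_id 'video=1', the template
# '$RepresentationID$/seg-$Number%05d$.m4s' becomes
# 'video=1/seg-%(Number)05d.m4s', which, applied with % {'Number': 3},
# expands to 'video=1/seg-00003.m4s'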
2765
2766 # @initialization is a regular template like @media one
2767 # so it should be handled just the same way (see
2768 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2769 if 'initialization' in representation_ms_info:
2770 initialization_template = prepare_template(
2771 'initialization',
2772 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2773 # $Time$ shall not be included for @initialization thus
2774 # only $Bandwidth$ remains
2775 ('Bandwidth', ))
2776 representation_ms_info['initialization_url'] = initialization_template % {
2777 'Bandwidth': bandwidth,
2778 }
2779
2780 def location_key(location):
2781 return 'url' if re.match(r'^https?://', location) else 'path'
2782
2783 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2784
2785 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2786 media_location_key = location_key(media_template)
2787
2788 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2789 # can't be used at the same time
2790 if '%(Number' in media_template and 's' not in representation_ms_info:
2791 segment_duration = None
2792 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2793 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2794 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2795 representation_ms_info['fragments'] = [{
2796 media_location_key: media_template % {
2797 'Number': segment_number,
2798 'Bandwidth': bandwidth,
2799 },
2800 'duration': segment_duration,
2801 } for segment_number in range(
2802 representation_ms_info['start_number'],
2803 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2804 else:
2805 # $Number*$ or $Time$ in media template with S list available
2806 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2807 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2808 representation_ms_info['fragments'] = []
2809 segment_time = 0
2810 segment_d = None
2811 segment_number = representation_ms_info['start_number']
2812
2813 def add_segment_url():
2814 segment_url = media_template % {
2815 'Time': segment_time,
2816 'Bandwidth': bandwidth,
2817 'Number': segment_number,
2818 }
2819 representation_ms_info['fragments'].append({
2820 media_location_key: segment_url,
2821 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2822 })
2823
2824 for num, s in enumerate(representation_ms_info['s']):
2825 segment_time = s.get('t') or segment_time
2826 segment_d = s['d']
2827 add_segment_url()
2828 segment_number += 1
2829 for r in range(s.get('r', 0)):
2830 segment_time += segment_d
2831 add_segment_url()
2832 segment_number += 1
2833 segment_time += segment_d
2834 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2835 # No media template
2836 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2837 # or any YouTube dashsegments video
2838 fragments = []
2839 segment_index = 0
2840 timescale = representation_ms_info['timescale']
2841 for s in representation_ms_info['s']:
2842 duration = float_or_none(s['d'], timescale)
2843 for r in range(s.get('r', 0) + 1):
2844 segment_uri = representation_ms_info['segment_urls'][segment_index]
2845 fragments.append({
2846 location_key(segment_uri): segment_uri,
2847 'duration': duration,
2848 })
2849 segment_index += 1
2850 representation_ms_info['fragments'] = fragments
2851 elif 'segment_urls' in representation_ms_info:
2852 # Segment URLs with no SegmentTimeline
2853 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2854 # https://github.com/ytdl-org/youtube-dl/pull/14844
2855 fragments = []
2856 segment_duration = float_or_none(
2857 representation_ms_info['segment_duration'],
2858 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2859 for segment_url in representation_ms_info['segment_urls']:
2860 fragment = {
2861 location_key(segment_url): segment_url,
2862 }
2863 if segment_duration:
2864 fragment['duration'] = segment_duration
2865 fragments.append(fragment)
2866 representation_ms_info['fragments'] = fragments
2867 # If a 'fragments' key is available, then we correctly recognized fragmented media.
2868 # Otherwise we will assume unfragmented media with direct access. Technically, such
2869 # an assumption is not necessarily correct, since we may simply have no support for
2870 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2871 if 'fragments' in representation_ms_info:
2872 f.update({
2873 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2874 'url': mpd_url or base_url,
2875 'fragment_base_url': base_url,
2876 'fragments': [],
2877 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2878 })
2879 if 'initialization_url' in representation_ms_info:
2880 initialization_url = representation_ms_info['initialization_url']
2881 if not f.get('url'):
2882 f['url'] = initialization_url
2883 f['fragments'].append({location_key(initialization_url): initialization_url})
2884 f['fragments'].extend(representation_ms_info['fragments'])
2885 else:
2886 # Assuming direct URL to unfragmented media.
2887 f['url'] = base_url
2888 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2889 formats.append(f)
2890 elif content_type == 'text':
2891 subtitles.setdefault(lang or 'und', []).append(f)
2892
2893 return formats, subtitles
2894
2895 def _extract_ism_formats(self, *args, **kwargs):
2896 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2897 if subs:
2898 self._report_ignoring_subs('ISM')
2899 return fmts
2900
2901 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2902 res = self._download_xml_handle(
2903 ism_url, video_id,
2904 note='Downloading ISM manifest' if note is None else note,
2905 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2906 fatal=fatal, data=data, headers=headers, query=query)
2907 if res is False:
2908 return [], {}
2909 ism_doc, urlh = res
2910 if ism_doc is None:
2911 return [], {}
2912
2913 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2914
2915 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2916 """
2917 Parse formats from ISM manifest.
2918 References:
2919 1. [MS-SSTR]: Smooth Streaming Protocol,
2920 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2921 """
2922 if ism_doc.get('IsLive') == 'TRUE':
2923 return [], {}
2924
2925 duration = int(ism_doc.attrib['Duration'])
2926 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2927
2928 formats = []
2929 subtitles = {}
2930 for stream in ism_doc.findall('StreamIndex'):
2931 stream_type = stream.get('Type')
2932 if stream_type not in ('video', 'audio', 'text'):
2933 continue
2934 url_pattern = stream.attrib['Url']
2935 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2936 stream_name = stream.get('Name')
2937 stream_language = stream.get('Language', 'und')
2938 for track in stream.findall('QualityLevel'):
2939 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2940 # TODO: add support for WVC1 and WMAP
2941 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2942 self.report_warning('%s is not a supported codec' % fourcc)
2943 continue
2944 tbr = int(track.attrib['Bitrate']) // 1000
2945 # [1] does not mention Width and Height attributes. However,
2946 # they're often present while MaxWidth and MaxHeight are
2947 # missing, so they should be used as fallbacks
2948 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2949 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2950 sampling_rate = int_or_none(track.get('SamplingRate'))
2951
2952 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2953 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2954
2955 fragments = []
2956 fragment_ctx = {
2957 'time': 0,
2958 }
2959 stream_fragments = stream.findall('c')
2960 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2961 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2962 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2963 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2964 if not fragment_ctx['duration']:
2965 try:
2966 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2967 except IndexError:
2968 next_fragment_time = duration
2969 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2970 for _ in range(fragment_repeat):
2971 fragments.append({
2972 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2973 'duration': fragment_ctx['duration'] / stream_timescale,
2974 })
2975 fragment_ctx['time'] += fragment_ctx['duration']
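# Sketch of the <c> (chunk) handling above: '<c t="0" d="20000000" r="2"/>'
# with stream_timescale 10000000 expands to two 2-second fragments at
# times 0 and 20000000 (here 'r' is read as the total chunk count)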
2976
2977 if stream_type == 'text':
2978 subtitles.setdefault(stream_language, []).append({
2979 'ext': 'ismt',
2980 'protocol': 'ism',
2981 'url': ism_url,
2982 'manifest_url': ism_url,
2983 'fragments': fragments,
2984 '_download_params': {
2985 'stream_type': stream_type,
2986 'duration': duration,
2987 'timescale': stream_timescale,
2988 'fourcc': fourcc,
2989 'language': stream_language,
2990 'codec_private_data': track.get('CodecPrivateData'),
2991 }
2992 })
2993 elif stream_type in ('video', 'audio'):
2994 formats.append({
2995 'format_id': join_nonempty(ism_id, stream_name, tbr),
2996 'url': ism_url,
2997 'manifest_url': ism_url,
2998 'ext': 'ismv' if stream_type == 'video' else 'isma',
2999 'width': width,
3000 'height': height,
3001 'tbr': tbr,
3002 'asr': sampling_rate,
3003 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3004 'acodec': 'none' if stream_type == 'video' else fourcc,
3005 'protocol': 'ism',
3006 'fragments': fragments,
3007 'has_drm': ism_doc.find('Protection') is not None,
3008 '_download_params': {
3009 'stream_type': stream_type,
3010 'duration': duration,
3011 'timescale': stream_timescale,
3012 'width': width or 0,
3013 'height': height or 0,
3014 'fourcc': fourcc,
3015 'language': stream_language,
3016 'codec_private_data': track.get('CodecPrivateData'),
3017 'sampling_rate': sampling_rate,
3018 'channels': int_or_none(track.get('Channels', 2)),
3019 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3020 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3021 },
3022 })
3023 return formats, subtitles
3024
3025 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3026 def absolute_url(item_url):
3027 return urljoin(base_url, item_url)
3028
3029 def parse_content_type(content_type):
3030 if not content_type:
3031 return {}
3032 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3033 if ctr:
3034 mimetype, codecs = ctr.groups()
3035 f = parse_codecs(codecs)
3036 f['ext'] = mimetype2ext(mimetype)
3037 return f
3038 return {}
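# Rough sketch of parse_content_type on a typical <source type=...> value:
#   parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# should yield something like
#   {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'ext': 'mp4'}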
3039
3040 def _media_formats(src, cur_media_type, type_info={}):
3041 full_url = absolute_url(src)
3042 ext = type_info.get('ext') or determine_ext(full_url)
3043 if ext == 'm3u8':
3044 is_plain_url = False
3045 formats = self._extract_m3u8_formats(
3046 full_url, video_id, ext='mp4',
3047 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3048 preference=preference, quality=quality, fatal=False)
3049 elif ext == 'mpd':
3050 is_plain_url = False
3051 formats = self._extract_mpd_formats(
3052 full_url, video_id, mpd_id=mpd_id, fatal=False)
3053 else:
3054 is_plain_url = True
3055 formats = [{
3056 'url': full_url,
3057 'vcodec': 'none' if cur_media_type == 'audio' else None,
3058 }]
3059 return is_plain_url, formats
3060
3061 entries = []
3062 # amp-video and amp-audio are very similar to their HTML5 counterparts
3063 # so we will include them right here (see
3064 # https://www.ampproject.org/docs/reference/components/amp-video)
3065 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3066 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3067 media_tags = [(media_tag, media_tag_name, media_type, '')
3068 for media_tag, media_tag_name, media_type
3069 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3070 media_tags.extend(re.findall(
3071 # We only allow video|audio followed by a whitespace or '>'.
3072 # Allowing more characters may result in a significant slowdown (see
3073 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3074 # http://www.porntrex.com/maps/videositemap.xml).
3075 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
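# e.g. both a self-closing '<video src="a.mp4"/>' and a paired
# '<amp-video ...>...</amp-video>' are picked up here; media_content
# is the empty string for the self-closing form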
3076 for media_tag, _, media_type, media_content in media_tags:
3077 media_info = {
3078 'formats': [],
3079 'subtitles': {},
3080 }
3081 media_attributes = extract_attributes(media_tag)
3082 src = strip_or_none(media_attributes.get('src'))
3083 if src:
3084 _, formats = _media_formats(src, media_type)
3085 media_info['formats'].extend(formats)
3086 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3087 if media_content:
3088 for source_tag in re.findall(r'<source[^>]+>', media_content):
3089 s_attr = extract_attributes(source_tag)
3090 # data-video-src and data-src are non-standard but seen
3091 # several times in the wild
3092 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3093 if not src:
3094 continue
3095 f = parse_content_type(s_attr.get('type'))
3096 is_plain_url, formats = _media_formats(src, media_type, f)
3097 if is_plain_url:
3098 # width, height, res, label and title attributes are
3099 # all non-standard but seen several times in the wild
3100 labels = [
3101 s_attr.get(lbl)
3102 for lbl in ('label', 'title')
3103 if str_or_none(s_attr.get(lbl))
3104 ]
3105 width = int_or_none(s_attr.get('width'))
3106 height = (int_or_none(s_attr.get('height'))
3107 or int_or_none(s_attr.get('res')))
3108 if not width or not height:
3109 for lbl in labels:
3110 resolution = parse_resolution(lbl)
3111 if not resolution:
3112 continue
3113 width = width or resolution.get('width')
3114 height = height or resolution.get('height')
3115 for lbl in labels:
3116 tbr = parse_bitrate(lbl)
3117 if tbr:
3118 break
3119 else:
3120 tbr = None
3121 f.update({
3122 'width': width,
3123 'height': height,
3124 'tbr': tbr,
3125 'format_id': s_attr.get('label') or s_attr.get('title'),
3126 })
3127 f.update(formats[0])
3128 media_info['formats'].append(f)
3129 else:
3130 media_info['formats'].extend(formats)
3131 for track_tag in re.findall(r'<track[^>]+>', media_content):
3132 track_attributes = extract_attributes(track_tag)
3133 kind = track_attributes.get('kind')
3134 if not kind or kind in ('subtitles', 'captions'):
3135 src = strip_or_none(track_attributes.get('src'))
3136 if not src:
3137 continue
3138 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3139 media_info['subtitles'].setdefault(lang, []).append({
3140 'url': absolute_url(src),
3141 })
3142 for f in media_info['formats']:
3143 f.setdefault('http_headers', {})['Referer'] = base_url
3144 if media_info['formats'] or media_info['subtitles']:
3145 entries.append(media_info)
3146 return entries
3147
3148 def _extract_akamai_formats(self, *args, **kwargs):
3149 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3150 if subs:
3151 self._report_ignoring_subs('akamai')
3152 return fmts
3153
3154 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3155 signed = 'hdnea=' in manifest_url
3156 if not signed:
3157 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3158 manifest_url = re.sub(
3159 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3160 '', manifest_url).strip('?')
3161
3162 formats = []
3163 subtitles = {}
3164
3165 hdcore_sign = 'hdcore=3.7.0'
3166 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3167 hds_host = hosts.get('hds')
3168 if hds_host:
3169 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3170 if 'hdcore=' not in f4m_url:
3171 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
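# Sketch of the HDS URL derivation above for a hypothetical Akamai URL:
#   'https://host/i/video/,360,720,.mp4.csmil/master.m3u8'
# becomes
#   'https://host/z/video/,360,720,.mp4.csmil/manifest.f4m?hdcore=3.7.0'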
3172 f4m_formats = self._extract_f4m_formats(
3173 f4m_url, video_id, f4m_id='hds', fatal=False)
3174 for entry in f4m_formats:
3175 entry.update({'extra_param_to_segment_url': hdcore_sign})
3176 formats.extend(f4m_formats)
3177
3178 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3179 hls_host = hosts.get('hls')
3180 if hls_host:
3181 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3182 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3183 m3u8_url, video_id, 'mp4', 'm3u8_native',
3184 m3u8_id='hls', fatal=False)
3185 formats.extend(m3u8_formats)
3186 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3187
3188 http_host = hosts.get('http')
3189 if http_host and m3u8_formats and not signed:
3190 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3191 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3192 qualities_length = len(qualities)
3193 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3194 i = 0
3195 for f in m3u8_formats:
3196 if f['vcodec'] != 'none':
3197 for protocol in ('http', 'https'):
3198 http_f = f.copy()
3199 del http_f['manifest_url']
3200 http_url = re.sub(
3201 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3202 http_f.update({
3203 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3204 'url': http_url,
3205 'protocol': protocol,
3206 })
3207 formats.append(http_f)
3208 i += 1
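# Sketch of the rewrite above: for the hypothetical m3u8_url
# 'https://host/i/video/,360,720,.mp4.csmil/master.m3u8', qualities is
# ['360', '720'] and each video HLS format is duplicated into direct
# 'http(s)://<http_host>/video/360.mp4'-style URLs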
3209
3210 return formats, subtitles
3211
3212 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3213 query = compat_urlparse.urlparse(url).query
3214 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3215 mobj = re.search(
3216 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3217 url_base = mobj.group('url')
3218 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3219 formats = []
3220
3221 def manifest_url(manifest):
3222 m_url = '%s/%s' % (http_base_url, manifest)
3223 if query:
3224 m_url += '?%s' % query
3225 return m_url
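# e.g. with url_base '//example.com/vod/mp4:sample.mp4' and query 'token=abc',
# manifest_url('playlist.m3u8') gives
# 'http://example.com/vod/mp4:sample.mp4/playlist.m3u8?token=abc'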
3226
3227 if 'm3u8' not in skip_protocols:
3228 formats.extend(self._extract_m3u8_formats(
3229 manifest_url('playlist.m3u8'), video_id, 'mp4',
3230 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3231 if 'f4m' not in skip_protocols:
3232 formats.extend(self._extract_f4m_formats(
3233 manifest_url('manifest.f4m'),
3234 video_id, f4m_id='hds', fatal=False))
3235 if 'dash' not in skip_protocols:
3236 formats.extend(self._extract_mpd_formats(
3237 manifest_url('manifest.mpd'),
3238 video_id, mpd_id='dash', fatal=False))
3239 if re.search(r'(?:/smil:|\.smil)', url_base):
3240 if 'smil' not in skip_protocols:
3241 rtmp_formats = self._extract_smil_formats(
3242 manifest_url('jwplayer.smil'),
3243 video_id, fatal=False)
3244 for rtmp_format in rtmp_formats:
3245 rtsp_format = rtmp_format.copy()
3246 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3247 del rtsp_format['play_path']
3248 del rtsp_format['ext']
3249 rtsp_format.update({
3250 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3251 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3252 'protocol': 'rtsp',
3253 })
3254 formats.extend([rtmp_format, rtsp_format])
3255 else:
3256 for protocol in ('rtmp', 'rtsp'):
3257 if protocol not in skip_protocols:
3258 formats.append({
3259 'url': '%s:%s' % (protocol, url_base),
3260 'format_id': protocol,
3261 'protocol': protocol,
3262 })
3263 return formats
3264
3265 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
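# The pattern below is meant to match inline player setups such as
#   jwplayer("myplayer").setup({"file": "https://example.com/video.mp4"})
# (player id and options object are illustrative)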
3266 mobj = re.search(
3267 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3268 webpage)
3269 if mobj:
3270 try:
3271 jwplayer_data = self._parse_json(mobj.group('options'),
3272 video_id=video_id,
3273 transform_source=transform_source)
3274 except ExtractorError:
3275 pass
3276 else:
3277 if isinstance(jwplayer_data, dict):
3278 return jwplayer_data
3279
3280 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3281 jwplayer_data = self._find_jwplayer_data(
3282 webpage, video_id, transform_source=js_to_json)
3283 return self._parse_jwplayer_data(
3284 jwplayer_data, video_id, *args, **kwargs)
3285
3286 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3287 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3288 # JWPlayer backward compatibility: flattened playlists
3289 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3290 if 'playlist' not in jwplayer_data:
3291 jwplayer_data = {'playlist': [jwplayer_data]}
3292
3293 entries = []
3294
3295 # JWPlayer backward compatibility: single playlist item
3296 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3297 if not isinstance(jwplayer_data['playlist'], list):
3298 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3299
3300 for video_data in jwplayer_data['playlist']:
3301 # JWPlayer backward compatibility: flattened sources
3302 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3303 if 'sources' not in video_data:
3304 video_data['sources'] = [video_data]
3305
3306 this_video_id = video_id or video_data['mediaid']
3307
3308 formats = self._parse_jwplayer_formats(
3309 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3310 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3311
3312 subtitles = {}
3313 tracks = video_data.get('tracks')
3314 if tracks and isinstance(tracks, list):
3315 for track in tracks:
3316 if not isinstance(track, dict):
3317 continue
3318 track_kind = track.get('kind')
3319 if not track_kind or not isinstance(track_kind, compat_str):
3320 continue
3321 if track_kind.lower() not in ('captions', 'subtitles'):
3322 continue
3323 track_url = urljoin(base_url, track.get('file'))
3324 if not track_url:
3325 continue
3326 subtitles.setdefault(track.get('label') or 'en', []).append({
3327 'url': self._proto_relative_url(track_url)
3328 })
3329
3330 entry = {
3331 'id': this_video_id,
3332 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3333 'description': clean_html(video_data.get('description')),
3334 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3335 'timestamp': int_or_none(video_data.get('pubdate')),
3336 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3337 'subtitles': subtitles,
3338 }
3339 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3340 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3341 entry.update({
3342 '_type': 'url_transparent',
3343 'url': formats[0]['url'],
3344 })
3345 else:
3346 self._sort_formats(formats)
3347 entry['formats'] = formats
3348 entries.append(entry)
3349 if len(entries) == 1:
3350 return entries[0]
3351 else:
3352 return self.playlist_result(entries)
3353
3354 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3355 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3356 urls = []
3357 formats = []
3358 for source in jwplayer_sources_data:
3359 if not isinstance(source, dict):
3360 continue
3361 source_url = urljoin(
3362 base_url, self._proto_relative_url(source.get('file')))
3363 if not source_url or source_url in urls:
3364 continue
3365 urls.append(source_url)
3366 source_type = source.get('type') or ''
3367 ext = mimetype2ext(source_type) or determine_ext(source_url)
3368 if source_type == 'hls' or ext == 'm3u8':
3369 formats.extend(self._extract_m3u8_formats(
3370 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3371 m3u8_id=m3u8_id, fatal=False))
3372 elif source_type == 'dash' or ext == 'mpd':
3373 formats.extend(self._extract_mpd_formats(
3374 source_url, video_id, mpd_id=mpd_id, fatal=False))
3375 elif ext == 'smil':
3376 formats.extend(self._extract_smil_formats(
3377 source_url, video_id, fatal=False))
3378 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3379 elif source_type.startswith('audio') or ext in (
3380 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3381 formats.append({
3382 'url': source_url,
3383 'vcodec': 'none',
3384 'ext': ext,
3385 })
3386 else:
3387 height = int_or_none(source.get('height'))
3388 if height is None:
3389 # Often no height is provided, but there is a label in
3390 # a format like "1080p", "720p SD", or 1080.
3391 height = int_or_none(self._search_regex(
3392 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3393 'height', default=None))
3394 a_format = {
3395 'url': source_url,
3396 'width': int_or_none(source.get('width')),
3397 'height': height,
3398 'tbr': int_or_none(source.get('bitrate')),
3399 'ext': ext,
3400 }
3401 if source_url.startswith('rtmp'):
3402 a_format['ext'] = 'flv'
3403 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3404 # of jwplayer.flash.swf
3405 rtmp_url_parts = re.split(
3406 r'((?:mp4|mp3|flv):)', source_url, 1)
3407 if len(rtmp_url_parts) == 3:
3408 rtmp_url, prefix, play_path = rtmp_url_parts
3409 a_format.update({
3410 'url': rtmp_url,
3411 'play_path': prefix + play_path,
3412 })
3413 if rtmp_params:
3414 a_format.update(rtmp_params)
3415 formats.append(a_format)
3416 return formats
3417
3418 def _live_title(self, name):
3419 """ Generate the title for a live video """
3420 now = datetime.datetime.now()
3421 now_str = now.strftime('%Y-%m-%d %H:%M')
3422 return name + ' ' + now_str
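# e.g. _live_title('Channel stream') -> 'Channel stream 2021-11-20 12:34'
# (for a hypothetical current local time)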
3423
3424 def _int(self, v, name, fatal=False, **kwargs):
3425 res = int_or_none(v, **kwargs)
3428 if res is None:
3429 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3430 if fatal:
3431 raise ExtractorError(msg)
3432 else:
3433 self.report_warning(msg)
3434 return res
3435
3436 def _float(self, v, name, fatal=False, **kwargs):
3437 res = float_or_none(v, **kwargs)
3438 if res is None:
3439 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3440 if fatal:
3441 raise ExtractorError(msg)
3442 else:
3443 self.report_warning(msg)
3444 return res
3445
3446 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3447 path='/', secure=False, discard=False, rest={}, **kwargs):
3448 cookie = compat_cookiejar_Cookie(
3449 0, name, value, port, port is not None, domain, True,
3450 domain.startswith('.'), path, True, secure, expire_time,
3451 discard, None, None, rest)
3452 self._downloader.cookiejar.set_cookie(cookie)
3453
3454 def _get_cookies(self, url):
3455 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3456 req = sanitized_Request(url)
3457 self._downloader.cookiejar.add_cookie_header(req)
3458 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3459
3460 def _apply_first_set_cookie_header(self, url_handle, cookie):
3461 """
3462 Apply first Set-Cookie header instead of the last. Experimental.
3463
3464 Some sites (e.g. [1-3]) may serve two cookies under the same name
3465 in the Set-Cookie header and expect the first (old) one to be set rather
3466 than the second (new) one. However, per RFC 6265 the newer cookie
3467 takes precedence in the cookie store, which is what actually happens.
3468 We work around this issue by manually resetting the cookie to
3469 the first one.
3470 1. https://new.vk.com/
3471 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3472 3. https://learning.oreilly.com/
3473 """
3474 for header, cookies in url_handle.headers.items():
3475 if header.lower() != 'set-cookie':
3476 continue
3477 if sys.version_info[0] >= 3:
3478 cookies = cookies.encode('iso-8859-1')
3479 cookies = cookies.decode('utf-8')
3480 cookie_value = re.search(
3481 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3482 if cookie_value:
3483 value, domain = cookie_value.groups()
3484 self._set_cookie(domain, cookie, value)
3485 break
3486
3487 def get_testcases(self, include_onlymatching=False):
3488 t = getattr(self, '_TEST', None)
3489 if t:
3490 assert not hasattr(self, '_TESTS'), \
3491 '%s has _TEST and _TESTS' % type(self).__name__
3492 tests = [t]
3493 else:
3494 tests = getattr(self, '_TESTS', [])
3495 for t in tests:
3496 if not include_onlymatching and t.get('only_matching', False):
3497 continue
3498 t['name'] = type(self).__name__[:-len('IE')]
3499 yield t
3500
3501 def is_suitable(self, age_limit):
3502 """ Test whether the extractor is generally suitable for the given
3503 age limit (i.e. pornographic sites are not, all others usually are) """
3504
3505 any_restricted = False
3506 for tc in self.get_testcases(include_onlymatching=False):
3507 if tc.get('playlist', []):
3508 tc = tc['playlist'][0]
3509 is_restricted = age_restricted(
3510 tc.get('info_dict', {}).get('age_limit'), age_limit)
3511 if not is_restricted:
3512 return True
3513 any_restricted = any_restricted or is_restricted
3514 return not any_restricted
3515
3516 def extract_subtitles(self, *args, **kwargs):
3517 if (self.get_param('writesubtitles', False)
3518 or self.get_param('listsubtitles')):
3519 return self._get_subtitles(*args, **kwargs)
3520 return {}
3521
3522 def _get_subtitles(self, *args, **kwargs):
3523 raise NotImplementedError('This method must be implemented by subclasses')
3524
3525 def extract_comments(self, *args, **kwargs):
3526 if not self.get_param('getcomments'):
3527 return None
3528 generator = self._get_comments(*args, **kwargs)
3529
3530 def extractor():
3531 comments = []
3532 try:
3533 while True:
3534 comments.append(next(generator))
3535 except KeyboardInterrupt:
3536 interrupted = True
3537 self.to_screen('Interrupted by user')
3538 except StopIteration:
3539 interrupted = False
3540 comment_count = len(comments)
3541 self.to_screen(f'Extracted {comment_count} comments')
3542 return {
3543 'comments': comments,
3544 'comment_count': None if interrupted else comment_count
3545 }
3546 return extractor
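# NB: the returned callable is lazy - extractors can store it in the info
# dict so that comments are only fetched when actually needed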
3547
3548 def _get_comments(self, *args, **kwargs):
3549 raise NotImplementedError('This method must be implemented by subclasses')
3550
3551 @staticmethod
3552 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3553 """ Merge subtitle items for one language. Items with duplicated URLs
3554 will be dropped. """
3555 list1_urls = {item['url'] for item in subtitle_list1}
3556 ret = list(subtitle_list1)
3557 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3558 return ret
3559
3560 @classmethod
3561 def _merge_subtitles(cls, *dicts, target=None):
3562 """ Merge subtitle dictionaries, language by language. """
3563 if target is None:
3564 target = {}
3565 for d in dicts:
3566 for lang, subs in d.items():
3567 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3568 return target
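# e.g. _merge_subtitles({'en': [{'url': 'a'}]}, {'en': [{'url': 'b'}], 'de': [{'url': 'c'}]})
# == {'en': [{'url': 'a'}, {'url': 'b'}], 'de': [{'url': 'c'}]}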
3569
3570 def extract_automatic_captions(self, *args, **kwargs):
3571 if (self.get_param('writeautomaticsub', False)
3572 or self.get_param('listsubtitles')):
3573 return self._get_automatic_captions(*args, **kwargs)
3574 return {}
3575
3576 def _get_automatic_captions(self, *args, **kwargs):
3577 raise NotImplementedError('This method must be implemented by subclasses')
3578
3579 def mark_watched(self, *args, **kwargs):
3580 if not self.get_param('mark_watched', False):
3581 return
3582 if (self._get_login_info()[0] is not None
3583 or self.get_param('cookiefile')
3584 or self.get_param('cookiesfrombrowser')):
3585 self._mark_watched(*args, **kwargs)
3586
3587 def _mark_watched(self, *args, **kwargs):
3588 raise NotImplementedError('This method must be implemented by subclasses')
3589
3590 def geo_verification_headers(self):
3591 headers = {}
3592 geo_verification_proxy = self.get_param('geo_verification_proxy')
3593 if geo_verification_proxy:
3594 headers['Ytdl-request-proxy'] = geo_verification_proxy
3595 return headers
3596
3597 def _generic_id(self, url):
3598 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3599
3600 def _generic_title(self, url):
3601 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3602
3603 @staticmethod
3604 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3605 all_known = all(map(
3606 lambda x: x is not None,
3607 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3608 return (
3609 'private' if is_private
3610 else 'premium_only' if needs_premium
3611 else 'subscriber_only' if needs_subscription
3612 else 'needs_auth' if needs_auth
3613 else 'unlisted' if is_unlisted
3614 else 'public' if all_known
3615 else None)
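# Sketch of the precedence above:
#   _availability(is_private=True) == 'private' (most restrictive wins)
#   _availability(False, False, False, False, False) == 'public'
#   _availability() == None (not all flags known)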
3616
3617 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3618 '''
3619 @returns A list of values for the extractor argument given by "key"
3620 or "default" if no such key is present
3621 @param default The default value to return when the key is not present (default: [])
3622 @param casesense When false, the values are converted to lower case
3623 '''
3624 val = traverse_obj(
3625 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3626 if val is None:
3627 return [] if default is NO_DEFAULT else default
3628 return list(val) if casesense else [x.lower() for x in val]
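# e.g. with '--extractor-args "youtube:player_client=android"', a YouTube
# extractor calling self._configuration_arg('player_client') should get
# ['android'] (illustrative key and value)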
3629
3630
3631 class SearchInfoExtractor(InfoExtractor):
3632 """
3633 Base class for paged search queries extractors.
3634 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3635 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3636 """
3637
3638 _MAX_RESULTS = float('inf')
3639
3640 @classmethod
3641 def _make_valid_url(cls):
3642 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
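# e.g. with _SEARCH_KEY = 'ytsearch' this matches 'ytsearch:cats',
# 'ytsearch5:cats' and 'ytsearchall:cats'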
3643
3644 @classmethod
3645 def suitable(cls, url):
3646 return re.match(cls._make_valid_url(), url) is not None
3647
3648 def _real_extract(self, query):
3649 mobj = re.match(self._make_valid_url(), query)
3650 if mobj is None:
3651 raise ExtractorError('Invalid search query "%s"' % query)
3652
3653 prefix = mobj.group('prefix')
3654 query = mobj.group('query')
3655 if prefix == '':
3656 return self._get_n_results(query, 1)
3657 elif prefix == 'all':
3658 return self._get_n_results(query, self._MAX_RESULTS)
3659 else:
3660 n = int(prefix)
3661 if n <= 0:
3662 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3663 elif n > self._MAX_RESULTS:
3664 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3665 n = self._MAX_RESULTS
3666 return self._get_n_results(query, n)
3667
3668 def _get_n_results(self, query, n):
3669 """Get a specified number of results for a query.
3670 Either this function or _search_results must be overridden by subclasses """
3671 return self.playlist_result(
3672 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3673 query, query)
3674
3675 def _search_results(self, query):
3676 """Returns an iterator of search results"""
3677 raise NotImplementedError('This method must be implemented by subclasses')
3678
3679 @property
3680 def SEARCH_KEY(self):
3681 return self._SEARCH_KEY