# coding: utf-8
from __future__ import unicode_literals

import base64
import collections
import hashlib
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    join_nonempty,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    UnsupportedError,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this information,
    possibly downloading the video to the file system, among other
    possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                       - HTTP URL to plain file media (in case of
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    text or html must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
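
    For illustration, a minimal single-video result could thus look like
    (the values here are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/media/4234987.mp4',
            'ext': 'mp4',
        }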


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            if '_VALID_URL' not in cls.__dict__:
                cls._VALID_URL = cls._make_valid_url()
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

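    # Illustrative sketch (not part of this class): a typical subclass defines
    # _VALID_URL with a named "id" group, which suitable() and _match_id() rely on:
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #     ExampleIE.suitable('https://example.com/watch/42')   # True
    #     ExampleIE._match_id('https://example.com/watch/42')  # '42'
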
    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize the geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is used for the initial geo bypass mechanism initialization
        during instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                # write_debug already adds the '[debug] ' prefix
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

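    # Illustrative sketch (not part of this class): an extractor that only learns
    # the unrestricted countries during extraction could call, for example:
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['US', 'DE'],
    #         'ip_blocks': ['192.0.2.0/24'],  # hypothetical CIDR block
    #     })
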
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback,
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

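    # Illustrative sketch (not part of this class): accepting a non-2xx response,
    # e.g. a page that returns its error details with HTTP 404:
    #
    #     webpage = self._download_webpage(url, video_id, expected_status=404)
    #     # or, with a callable:
    #     webpage = self._download_webpage(
    #         url, video_id, expected_status=lambda status: 400 <= status < 500)
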
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the XML as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

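    # Illustrative sketch (not part of this class): a typical API call; the
    # endpoint and parameters here are hypothetical:
    #
    #     data = self._download_json(
    #         'https://example.com/api/video', video_id,
    #         query={'id': video_id}, headers={'Accept': 'application/json'},
    #         note='Downloading video metadata')
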
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        video_info.update(kwargs)
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

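    # Illustrative sketch (not part of this class): wrapping matched URLs into a
    # playlist result; the identifiers and URL pattern here are hypothetical:
    #
    #     entries = [self.url_result('https://example.com/watch/%s' % vid, ie='Example')
    #                for vid in video_ids]
    #     return self.playlist_result(entries, playlist_id, playlist_title)
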
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value, report a warning, or raise
        a RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

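    # Illustrative sketch (not part of this class): pulling a named group out of
    # a webpage, with a default so extraction continues on failure (the regex
    # shown is hypothetical):
    #
    #     video_id = self._search_regex(
    #         r'data-video-id="(?P<id>\d+)"', webpage, 'video id',
    #         group='id', default=None)
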
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

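    # Illustrative example (standard netrc format, hypothetical values): a line
    # in ~/.netrc that _get_netrc_login_info would pick up for machine 'example':
    #
    #     machine example login myaccount@example.com password mypassword
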
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

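    # Illustrative sketch (not part of this class): given markup such as
    # <meta property="og:title" content="Some title">, the OpenGraph helpers
    # defined below would be used as:
    #
    #     title = self._og_search_title(webpage)
    #     thumbnail = self._og_search_thumbnail(webpage)
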
    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

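    # Illustrative sketch (not part of this class): a page embedding
    # <script type="application/ld+json">{"@context": "https://schema.org",
    #  "@type": "VideoObject", "name": "...", "uploadDate": "..."}</script>
    # could be handled with:
    #
    #     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
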
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per spec) with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

1454 def traverse_json_ld(json_ld, at_top_level=True):
1455 for e in json_ld:
1456 if at_top_level and '@context' not in e:
1457 continue
1458 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1459 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1460 break
1461 item_type = e.get('@type')
1462 if expected_type is not None and expected_type != item_type:
1463 continue
1464 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1465 if rating is not None:
1466 info['average_rating'] = rating
1467 if item_type in ('TVEpisode', 'Episode'):
1468 episode_name = unescapeHTML(e.get('name'))
1469 info.update({
1470 'episode': episode_name,
1471 'episode_number': int_or_none(e.get('episodeNumber')),
1472 'description': unescapeHTML(e.get('description')),
1473 })
1474 if not info.get('title') and episode_name:
1475 info['title'] = episode_name
1476 part_of_season = e.get('partOfSeason')
1477 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1478 info.update({
1479 'season': unescapeHTML(part_of_season.get('name')),
1480 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1481 })
1482 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1483 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1484 info['series'] = unescapeHTML(part_of_series.get('name'))
1485 elif item_type == 'Movie':
1486 info.update({
1487 'title': unescapeHTML(e.get('name')),
1488 'description': unescapeHTML(e.get('description')),
1489 'duration': parse_duration(e.get('duration')),
1490 'timestamp': unified_timestamp(e.get('dateCreated')),
1491 })
1492 elif item_type in ('Article', 'NewsArticle'):
1493 info.update({
1494 'timestamp': parse_iso8601(e.get('datePublished')),
1495 'title': unescapeHTML(e.get('headline')),
1496 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1497 })
1498 elif item_type == 'VideoObject':
1499 extract_video_object(e)
1500 if expected_type is None:
1501 continue
1502 else:
1503 break
1504 video = e.get('video')
1505 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1506 extract_video_object(video)
1507 if expected_type is None:
1508 continue
1509 else:
1510 break
1511 traverse_json_ld(json_ld)
1512
1513 return {k: v for k, v in info.items() if v is not None}
1514
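# A minimal usage sketch with hypothetical values: given
#   json_ld = '''{"@context": "https://schema.org", "@type": "VideoObject",
#                 "name": "Example clip", "duration": "PT1M30S",
#                 "uploadDate": "2021-12-01"}'''
# _json_ld(json_ld, video_id, expected_type='VideoObject') would return
#   {'title': 'Example clip', 'duration': 90, 'timestamp': 1638316800}
# (None-valued fields are filtered out by the final dict comprehension).
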
1515 def _search_nextjs_data(self, webpage, video_id, **kw):
1516 return self._parse_json(
1517 self._search_regex(
1518 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1519 webpage, 'next.js data', **kw),
1520 video_id, **kw)
1521
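# A sketch of the markup this targets (hypothetical page):
#   <script id="__NEXT_DATA__" type="application/json">
#     {"props": {"pageProps": {}}}
#   </script>
# The JSON payload between the tags is extracted and parsed.
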
1522 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1523 ''' Parses Nuxt.js metadata. This works as long as the function invoked to produce __NUXT__ is a pure function. '''
1524 # Not all websites use the default "__NUXT__" context name; it can be changed:
1525 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1526 rectx = re.escape(context_name)
1527 js, arg_keys, arg_vals = self._search_regex(
1528 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1529 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1530 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1531
1532 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1533
1534 for key, val in args.items():
1535 if val in ('undefined', 'void 0'):
1536 args[key] = 'null'
1537
1538 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1539
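# A sketch of the pattern this targets (hypothetical page):
#   <script>window.__NUXT__=(function(a,b){return {data:[{title:a}]}}("x",1));</script>
# The argument names are zipped with the passed values, js_to_json()
# substitutes them while converting the object literal to JSON, and the
# first element of its 'data' array is returned.
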
1540 @staticmethod
1541 def _hidden_inputs(html):
1542 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1543 hidden_inputs = {}
1544 for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1545 attrs = extract_attributes(input_el)
1546 if not attrs:
1547 continue
1548 if attrs.get('type') not in ('hidden', 'submit'):
1549 continue
1550 name = attrs.get('name') or attrs.get('id')
1551 value = attrs.get('value')
1552 if name and value is not None:
1553 hidden_inputs[name] = value
1554 return hidden_inputs
1555
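# A minimal sketch with hypothetical markup: for
#   html = '<input type="hidden" name="token" value="abc123">'
# _hidden_inputs(html) returns {'token': 'abc123'}; inputs whose type is
# neither "hidden" nor "submit" are skipped.
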
1556 def _form_hidden_inputs(self, form_id, html):
1557 form = self._search_regex(
1558 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1559 html, '%s form' % form_id, group='form')
1560 return self._hidden_inputs(form)
1561
1562 class FormatSort:
1563 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
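# Examples of sort strings accepted by the regex above (a sketch):
#   'res'      sort by resolution, higher preferred
#   '+res'     reversed: lower preferred
#   'res:1080' prefer the largest value not exceeding 1080
#   'res~1080' prefer the value closest to 1080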
1564
1565 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1566 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1567 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1568 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1569 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1570 'fps', 'fs_approx', 'source', 'id')
1571
1572 settings = {
1573 'vcodec': {'type': 'ordered', 'regex': True,
1574 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1575 'acodec': {'type': 'ordered', 'regex': True,
1576 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1577 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1578 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1579 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1580 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1581 'vext': {'type': 'ordered', 'field': 'video_ext',
1582 'order': ('mp4', 'webm', 'flv', '', 'none'),
1583 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1584 'aext': {'type': 'ordered', 'field': 'audio_ext',
1585 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1586 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1587 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1588 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1589 'field': ('vcodec', 'acodec'),
1590 'function': lambda it: int(any(v != 'none' for v in it))},
1591 'ie_pref': {'priority': True, 'type': 'extractor'},
1592 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1593 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1594 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1595 'quality': {'convert': 'float', 'default': -1},
1596 'filesize': {'convert': 'bytes'},
1597 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1598 'id': {'convert': 'string', 'field': 'format_id'},
1599 'height': {'convert': 'float_none'},
1600 'width': {'convert': 'float_none'},
1601 'fps': {'convert': 'float_none'},
1602 'tbr': {'convert': 'float_none'},
1603 'vbr': {'convert': 'float_none'},
1604 'abr': {'convert': 'float_none'},
1605 'asr': {'convert': 'float_none'},
1606 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1607
1608 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1609 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1610 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1611 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1612 'res': {'type': 'multiple', 'field': ('height', 'width'),
1613 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1614
1615 # For compatibility with youtube-dl
1616 'format_id': {'type': 'alias', 'field': 'id'},
1617 'preference': {'type': 'alias', 'field': 'ie_pref'},
1618 'language_preference': {'type': 'alias', 'field': 'lang'},
1619
1620 # Deprecated
1621 'dimension': {'type': 'alias', 'field': 'res'},
1622 'resolution': {'type': 'alias', 'field': 'res'},
1623 'extension': {'type': 'alias', 'field': 'ext'},
1624 'bitrate': {'type': 'alias', 'field': 'br'},
1625 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1626 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1627 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1628 'framerate': {'type': 'alias', 'field': 'fps'},
1629 'protocol': {'type': 'alias', 'field': 'proto'},
1630 'source_preference': {'type': 'alias', 'field': 'source'},
1631 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1632 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1633 'samplerate': {'type': 'alias', 'field': 'asr'},
1634 'video_ext': {'type': 'alias', 'field': 'vext'},
1635 'audio_ext': {'type': 'alias', 'field': 'aext'},
1636 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1637 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1638 'video': {'type': 'alias', 'field': 'hasvid'},
1639 'has_video': {'type': 'alias', 'field': 'hasvid'},
1640 'audio': {'type': 'alias', 'field': 'hasaud'},
1641 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1642 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1643 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1644 }
1645
1646 def __init__(self, ie, field_preference):
1647 self._order = []
1648 self.ydl = ie._downloader
1649 self.evaluate_params(self.ydl.params, field_preference)
1650 if ie.get_param('verbose'):
1651 self.print_verbose_info(self.ydl.write_debug)
1652
1653 def _get_field_setting(self, field, key):
1654 if field not in self.settings:
1655 if key in ('forced', 'priority'):
1656 return False
1657 self.ydl.deprecation_warning(
1658 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1659 'and may be removed in a future version')
1660 self.settings[field] = {}
1661 propObj = self.settings[field]
1662 if key not in propObj:
1663 type = propObj.get('type')
1664 if key == 'field':
1665 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1666 elif key == 'convert':
1667 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1668 else:
1669 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1670 propObj[key] = default
1671 return propObj[key]
1672
1673 def _resolve_field_value(self, field, value, convertNone=False):
1674 if value is None:
1675 if not convertNone:
1676 return None
1677 else:
1678 value = value.lower()
1679 conversion = self._get_field_setting(field, 'convert')
1680 if conversion == 'ignore':
1681 return None
1682 if conversion == 'string':
1683 return value
1684 elif conversion == 'float_none':
1685 return float_or_none(value)
1686 elif conversion == 'bytes':
1687 return FileDownloader.parse_bytes(value)
1688 elif conversion == 'order':
1689 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1690 use_regex = self._get_field_setting(field, 'regex')
1691 list_length = len(order_list)
1692 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1693 if use_regex and value is not None:
1694 for i, regex in enumerate(order_list):
1695 if regex and re.match(regex, value):
1696 return list_length - i
1697 return list_length - empty_pos # not in list
1698 else: # not regex or value = None
1699 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1700 else:
1701 if value.isnumeric():
1702 return float(value)
1703 else:
1704 self.settings[field]['convert'] = 'string'
1705 return value
1706
1707 def evaluate_params(self, params, sort_extractor):
1708 self._use_free_order = params.get('prefer_free_formats', False)
1709 self._sort_user = params.get('format_sort', [])
1710 self._sort_extractor = sort_extractor
1711
1712 def add_item(field, reverse, closest, limit_text):
1713 field = field.lower()
1714 if field in self._order:
1715 return
1716 self._order.append(field)
1717 limit = self._resolve_field_value(field, limit_text)
1718 data = {
1719 'reverse': reverse,
1720 'closest': False if limit is None else closest,
1721 'limit_text': limit_text,
1722 'limit': limit}
1723 if field in self.settings:
1724 self.settings[field].update(data)
1725 else:
1726 self.settings[field] = data
1727
1728 sort_list = (
1729 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1730 + (tuple() if params.get('format_sort_force', False)
1731 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1732 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1733
1734 for item in sort_list:
1735 match = re.match(self.regex, item)
1736 if match is None:
1737 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1738 field = match.group('field')
1739 if field is None:
1740 continue
1741 if self._get_field_setting(field, 'type') == 'alias':
1742 alias, field = field, self._get_field_setting(field, 'field')
1743 if alias not in ('format_id', 'preference', 'language_preference'):
1744 self.ydl.deprecation_warning(
1745 f'Format sorting alias {alias} is deprecated '
1746 f'and may be removed in a future version. Please use {field} instead')
1747 reverse = match.group('reverse') is not None
1748 closest = match.group('separator') == '~'
1749 limit_text = match.group('limit')
1750
1751 has_limit = limit_text is not None
1752 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1753 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1754
1755 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1756 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1757 limit_count = len(limits)
1758 for (i, f) in enumerate(fields):
1759 add_item(f, reverse, closest,
1760 limits[i] if i < limit_count
1761 else limits[0] if has_limit and not has_multiple_limits
1762 else None)
1763
1764 def print_verbose_info(self, write_debug):
1765 if self._sort_user:
1766 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1767 if self._sort_extractor:
1768 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1769 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1770 '+' if self._get_field_setting(field, 'reverse') else '', field,
1771 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1772 self._get_field_setting(field, 'limit_text'),
1773 self._get_field_setting(field, 'limit'))
1774 if self._get_field_setting(field, 'limit_text') is not None else '')
1775 for field in self._order if self._get_field_setting(field, 'visible')]))
1776
1777 def _calculate_field_preference_from_value(self, format, field, type, value):
1778 reverse = self._get_field_setting(field, 'reverse')
1779 closest = self._get_field_setting(field, 'closest')
1780 limit = self._get_field_setting(field, 'limit')
1781
1782 if type == 'extractor':
1783 maximum = self._get_field_setting(field, 'max')
1784 if value is None or (maximum is not None and value >= maximum):
1785 value = -1
1786 elif type == 'boolean':
1787 in_list = self._get_field_setting(field, 'in_list')
1788 not_in_list = self._get_field_setting(field, 'not_in_list')
1789 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1790 elif type == 'ordered':
1791 value = self._resolve_field_value(field, value, True)
1792
1793 # try to convert to number
1794 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1795 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1796 if is_num:
1797 value = val_num
1798
1799 return ((-10, 0) if value is None
1800 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1801 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1802 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1803 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1804 else (-1, value, 0))
1805
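# A sketch of the ordering above: each field yields a tuple that sort()
# compares lexicographically. E.g. with limit=720, closest=True and
# reverse=False, the values 480, 720 and 1080 map to (0, -240, 240),
# (0, 0, 0) and (0, -360, -360) respectively, so 720 ranks best,
# then 480, then 1080.
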
1806 def _calculate_field_preference(self, format, field):
1807 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1808 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1809 if type == 'multiple':
1810 type = 'field' # Only 'field' is allowed in multiple for now
1811 actual_fields = self._get_field_setting(field, 'field')
1812
1813 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1814 else:
1815 value = get_value(field)
1816 return self._calculate_field_preference_from_value(format, field, type, value)
1817
1818 def calculate_preference(self, format):
1819 # Determine missing protocol
1820 if not format.get('protocol'):
1821 format['protocol'] = determine_protocol(format)
1822
1823 # Determine missing ext
1824 if not format.get('ext') and 'url' in format:
1825 format['ext'] = determine_ext(format['url'])
1826 if format.get('vcodec') == 'none':
1827 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1828 format['video_ext'] = 'none'
1829 else:
1830 format['video_ext'] = format['ext']
1831 format['audio_ext'] = 'none'
1832 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1833 # format['preference'] = -1000
1834
1835 # Determine missing bitrates
1836 if format.get('tbr') is None:
1837 if format.get('vbr') is not None and format.get('abr') is not None:
1838 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1839 else:
1840 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1841 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1842 if format.get('acodec') != 'none' and format.get('abr') is None:
1843 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1844
1845 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1846
1847 def _sort_formats(self, formats, field_preference=[]):
1848 if not formats:
1849 return
1850 format_sort = self.FormatSort(self, field_preference)
1851 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1852
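# Typical extractor usage (a sketch with hypothetical formats):
#   formats = [{'url': u1, 'height': 720}, {'url': u2, 'height': 1080}]
#   self._sort_formats(formats)           # default field order
#   self._sort_formats(formats, ['res'])  # extractor-supplied order
# The list is sorted in place from worst to best, as YoutubeDL expects.
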
1853 def _check_formats(self, formats, video_id):
1854 if formats:
1855 formats[:] = filter(
1856 lambda f: self._is_valid_url(
1857 f['url'], video_id,
1858 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1859 formats)
1860
1861 @staticmethod
1862 def _remove_duplicate_formats(formats):
1863 format_urls = set()
1864 unique_formats = []
1865 for f in formats:
1866 if f['url'] not in format_urls:
1867 format_urls.add(f['url'])
1868 unique_formats.append(f)
1869 formats[:] = unique_formats
1870
1871 def _is_valid_url(self, url, video_id, item='video', headers={}):
1872 url = self._proto_relative_url(url, scheme='http:')
1873 # For now, assume non-HTTP(S) URLs are always valid
1874 if not (url.startswith('http://') or url.startswith('https://')):
1875 return True
1876 try:
1877 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1878 return True
1879 except ExtractorError as e:
1880 self.to_screen(
1881 '%s: %s URL is invalid, skipping: %s'
1882 % (video_id, item, error_to_compat_str(e.cause)))
1883 return False
1884
1885 def http_scheme(self):
1886 """ Either "http:" or "https:", depending on the user's preferences """
1887 return (
1888 'http:'
1889 if self.get_param('prefer_insecure', False)
1890 else 'https:')
1891
1892 def _proto_relative_url(self, url, scheme=None):
1893 if url is None:
1894 return url
1895 if url.startswith('//'):
1896 if scheme is None:
1897 scheme = self.http_scheme()
1898 return scheme + url
1899 else:
1900 return url
1901
1902 def _sleep(self, timeout, video_id, msg_template=None):
1903 if msg_template is None:
1904 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1905 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1906 self.to_screen(msg)
1907 time.sleep(timeout)
1908
1909 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1910 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1911 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1912 manifest = self._download_xml(
1913 manifest_url, video_id, 'Downloading f4m manifest',
1914 'Unable to download f4m manifest',
1915 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1916 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1917 transform_source=transform_source,
1918 fatal=fatal, data=data, headers=headers, query=query)
1919
1920 if manifest is False:
1921 return []
1922
1923 return self._parse_f4m_formats(
1924 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1925 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1926
1927 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1928 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1929 fatal=True, m3u8_id=None):
1930 if not isinstance(manifest, compat_etree_Element) and not fatal:
1931 return []
1932
1933 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1934 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1935 if akamai_pv is not None and ';' in akamai_pv.text:
1936 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1937 if playerVerificationChallenge.strip() != '':
1938 return []
1939
1940 formats = []
1941 manifest_version = '1.0'
1942 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1943 if not media_nodes:
1944 manifest_version = '2.0'
1945 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1946 # Remove unsupported DRM protected media from final formats
1947 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1948 media_nodes = remove_encrypted_media(media_nodes)
1949 if not media_nodes:
1950 return formats
1951
1952 manifest_base_url = get_base_url(manifest)
1953
1954 bootstrap_info = xpath_element(
1955 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1956 'bootstrap info', default=None)
1957
1958 vcodec = None
1959 mime_type = xpath_text(
1960 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1961 'mime type', default=None)
1962 if mime_type and mime_type.startswith('audio/'):
1963 vcodec = 'none'
1964
1965 for i, media_el in enumerate(media_nodes):
1966 tbr = int_or_none(media_el.attrib.get('bitrate'))
1967 width = int_or_none(media_el.attrib.get('width'))
1968 height = int_or_none(media_el.attrib.get('height'))
1969 format_id = join_nonempty(f4m_id, tbr or i)
1970 # If <bootstrapInfo> is present, the specified f4m is a
1971 # stream-level manifest, and only set-level manifests may refer to
1972 # external resources. See section 11.4 and section 4 of F4M spec
1973 if bootstrap_info is None:
1974 media_url = None
1975 # @href is introduced in 2.0, see section 11.6 of F4M spec
1976 if manifest_version == '2.0':
1977 media_url = media_el.attrib.get('href')
1978 if media_url is None:
1979 media_url = media_el.attrib.get('url')
1980 if not media_url:
1981 continue
1982 manifest_url = (
1983 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1984 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1985 # If media_url is itself an f4m manifest, do the recursive extraction,
1986 # since bitrates in the parent manifest (this one) and the media_url
1987 # manifest may differ, making it impossible to resolve the format by
1988 # the requested bitrate in the f4m downloader
1989 ext = determine_ext(manifest_url)
1990 if ext == 'f4m':
1991 f4m_formats = self._extract_f4m_formats(
1992 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1993 transform_source=transform_source, fatal=fatal)
1994 # Sometimes a stream-level manifest contains a single media entry that
1995 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1996 # At the same time, the parent's media entry in the set-level manifest may
1997 # contain it. We copy it from the parent in such cases.
1998 if len(f4m_formats) == 1:
1999 f = f4m_formats[0]
2000 f.update({
2001 'tbr': f.get('tbr') or tbr,
2002 'width': f.get('width') or width,
2003 'height': f.get('height') or height,
2004 'format_id': f.get('format_id') if not tbr else format_id,
2005 'vcodec': vcodec,
2006 })
2007 formats.extend(f4m_formats)
2008 continue
2009 elif ext == 'm3u8':
2010 formats.extend(self._extract_m3u8_formats(
2011 manifest_url, video_id, 'mp4', preference=preference,
2012 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2013 continue
2014 formats.append({
2015 'format_id': format_id,
2016 'url': manifest_url,
2017 'manifest_url': manifest_url,
2018 'ext': 'flv' if bootstrap_info is not None else None,
2019 'protocol': 'f4m',
2020 'tbr': tbr,
2021 'width': width,
2022 'height': height,
2023 'vcodec': vcodec,
2024 'preference': preference,
2025 'quality': quality,
2026 })
2027 return formats
2028
2029 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2030 return {
2031 'format_id': join_nonempty(m3u8_id, 'meta'),
2032 'url': m3u8_url,
2033 'ext': ext,
2034 'protocol': 'm3u8',
2035 'preference': preference - 100 if preference else -100,
2036 'quality': quality,
2037 'resolution': 'multiple',
2038 'format_note': 'Quality selection URL',
2039 }
2040
2041 def _report_ignoring_subs(self, name):
2042 self.report_warning(bug_reports_message(
2043 f'Ignoring subtitle tracks found in the {name} manifest; '
2044 'if any subtitle tracks are missing,'
2045 ), only_once=True)
2046
2047 def _extract_m3u8_formats(self, *args, **kwargs):
2048 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2049 if subs:
2050 self._report_ignoring_subs('HLS')
2051 return fmts
2052
2053 def _extract_m3u8_formats_and_subtitles(
2054 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2055 preference=None, quality=None, m3u8_id=None, note=None,
2056 errnote=None, fatal=True, live=False, data=None, headers={},
2057 query={}):
2058
2059 res = self._download_webpage_handle(
2060 m3u8_url, video_id,
2061 note='Downloading m3u8 information' if note is None else note,
2062 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2063 fatal=fatal, data=data, headers=headers, query=query)
2064
2065 if res is False:
2066 return [], {}
2067
2068 m3u8_doc, urlh = res
2069 m3u8_url = urlh.geturl()
2070
2071 return self._parse_m3u8_formats_and_subtitles(
2072 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2073 preference=preference, quality=quality, m3u8_id=m3u8_id,
2074 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2075 headers=headers, query=query, video_id=video_id)
2076
2077 def _parse_m3u8_formats_and_subtitles(
2078 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2079 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2080 errnote=None, fatal=True, data=None, headers={}, query={},
2081 video_id=None):
2082 formats, subtitles = [], {}
2083
2084 has_drm = re.search('|'.join([
2085 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2086 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2087 ]), m3u8_doc)
2088
2089 def format_url(url):
2090 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2091
2092 if self.get_param('hls_split_discontinuity', False):
2093 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2094 if not m3u8_doc:
2095 if not manifest_url:
2096 return []
2097 m3u8_doc = self._download_webpage(
2098 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2099 note=False, errnote='Failed to download m3u8 playlist information')
2100 if m3u8_doc is False:
2101 return []
2102 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2103
2104 else:
2105 def _extract_m3u8_playlist_indices(*args, **kwargs):
2106 return [None]
2107
2108 # References:
2109 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2110 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2111 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2112
2113 # We should try extracting formats only from master playlists [1, 4.3.4],
2114 # i.e. playlists that describe available qualities. On the other hand,
2115 # media playlists [1, 4.3.3] should be returned as is since they contain
2116 # just the media without quality renditions.
2117 # Fortunately, a master playlist can easily be distinguished from a media
2118 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2119 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2120 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2121 # media playlist and MUST NOT appear in a master playlist, so we can
2122 # reliably detect a media playlist by this criterion.
2123
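# A sketch of the distinction (hypothetical manifests): a master playlist
# contains lines like
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# while a media playlist contains
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts
# and is therefore caught by the #EXT-X-TARGETDURATION check below.
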
2124 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2125 formats = [{
2126 'format_id': join_nonempty(m3u8_id, idx),
2127 'format_index': idx,
2128 'url': m3u8_url,
2129 'ext': ext,
2130 'protocol': entry_protocol,
2131 'preference': preference,
2132 'quality': quality,
2133 'has_drm': has_drm,
2134 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2135
2136 return formats, subtitles
2137
2138 groups = {}
2139 last_stream_inf = {}
2140
2141 def extract_media(x_media_line):
2142 media = parse_m3u8_attributes(x_media_line)
2143 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2144 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2145 if not (media_type and group_id and name):
2146 return
2147 groups.setdefault(group_id, []).append(media)
2148 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2149 if media_type == 'SUBTITLES':
2150 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2151 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2152 # However, lack of URI has been spotted in the wild.
2153 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2154 if not media.get('URI'):
2155 return
2156 url = format_url(media['URI'])
2157 sub_info = {
2158 'url': url,
2159 'ext': determine_ext(url),
2160 }
2161 if sub_info['ext'] == 'm3u8':
2162 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2163 # files may contain is WebVTT:
2164 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2165 sub_info['ext'] = 'vtt'
2166 sub_info['protocol'] = 'm3u8_native'
2167 lang = media.get('LANGUAGE') or 'und'
2168 subtitles.setdefault(lang, []).append(sub_info)
2169 if media_type not in ('VIDEO', 'AUDIO'):
2170 return
2171 media_url = media.get('URI')
2172 if media_url:
2173 manifest_url = format_url(media_url)
2174 formats.extend({
2175 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2176 'format_note': name,
2177 'format_index': idx,
2178 'url': manifest_url,
2179 'manifest_url': m3u8_url,
2180 'language': media.get('LANGUAGE'),
2181 'ext': ext,
2182 'protocol': entry_protocol,
2183 'preference': preference,
2184 'quality': quality,
2185 'vcodec': 'none' if media_type == 'AUDIO' else None,
2186 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2187
2188 def build_stream_name():
2189 # Although the specification does not mention a NAME attribute for
2190 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2191 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2192 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2193 stream_name = last_stream_inf.get('NAME')
2194 if stream_name:
2195 return stream_name
2196 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2197 # from corresponding rendition group
2198 stream_group_id = last_stream_inf.get('VIDEO')
2199 if not stream_group_id:
2200 return
2201 stream_group = groups.get(stream_group_id)
2202 if not stream_group:
2203 return stream_group_id
2204 rendition = stream_group[0]
2205 return rendition.get('NAME') or stream_group_id
2206
2207 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2208 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2209 # precede EXT-X-MEDIA tags in the HLS manifest, such as in [3].
2210 for line in m3u8_doc.splitlines():
2211 if line.startswith('#EXT-X-MEDIA:'):
2212 extract_media(line)
2213
2214 for line in m3u8_doc.splitlines():
2215 if line.startswith('#EXT-X-STREAM-INF:'):
2216 last_stream_inf = parse_m3u8_attributes(line)
2217 elif line.startswith('#') or not line.strip():
2218 continue
2219 else:
2220 tbr = float_or_none(
2221 last_stream_inf.get('AVERAGE-BANDWIDTH')
2222 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2223 manifest_url = format_url(line.strip())
2224
2225 for idx in _extract_m3u8_playlist_indices(manifest_url):
2226 format_id = [m3u8_id, None, idx]
2227 # Bandwidth of live streams may differ over time thus making
2228 # format_id unpredictable. So it's better to keep provided
2229 # format_id intact.
2230 if not live:
2231 stream_name = build_stream_name()
2232 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2233 f = {
2234 'format_id': join_nonempty(*format_id),
2235 'format_index': idx,
2236 'url': manifest_url,
2237 'manifest_url': m3u8_url,
2238 'tbr': tbr,
2239 'ext': ext,
2240 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2241 'protocol': entry_protocol,
2242 'preference': preference,
2243 'quality': quality,
2244 }
2245 resolution = last_stream_inf.get('RESOLUTION')
2246 if resolution:
2247 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2248 if mobj:
2249 f['width'] = int(mobj.group('width'))
2250 f['height'] = int(mobj.group('height'))
2251 # Unified Streaming Platform
2252 mobj = re.search(
2253 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2254 if mobj:
2255 abr, vbr = mobj.groups()
2256 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2257 f.update({
2258 'vbr': vbr,
2259 'abr': abr,
2260 })
2261 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2262 f.update(codecs)
2263 audio_group_id = last_stream_inf.get('AUDIO')
2264 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2265 # references a rendition group MUST have a CODECS attribute.
2266 # However, this is not always respected, for example, [2]
2267 # contains EXT-X-STREAM-INF tag which references AUDIO
2268 # rendition group but does not have CODECS and despite
2269 # referencing an audio group it represents a complete
2270 # (with audio and video) format. So, for such cases we will
2271 # ignore references to rendition groups and treat them
2272 # as complete formats.
2273 if audio_group_id and codecs and f.get('vcodec') != 'none':
2274 audio_group = groups.get(audio_group_id)
2275 if audio_group and audio_group[0].get('URI'):
2276 # TODO: update acodec for audio only formats with
2277 # the same GROUP-ID
2278 f['acodec'] = 'none'
2279 if not f.get('ext'):
2280 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2281 formats.append(f)
2282
2283 # for DailyMotion
2284 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2285 if progressive_uri:
2286 http_f = f.copy()
2287 del http_f['manifest_url']
2288 http_f.update({
2289 'format_id': f['format_id'].replace('hls-', 'http-'),
2290 'protocol': 'http',
2291 'url': progressive_uri,
2292 })
2293 formats.append(http_f)
2294
2295 last_stream_inf = {}
2296 return formats, subtitles
2297
2298 def _extract_m3u8_vod_duration(
2299 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2300
2301 m3u8_vod = self._download_webpage(
2302 m3u8_vod_url, video_id,
2303 note='Downloading m3u8 VOD manifest' if note is None else note,
2304 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2305 fatal=False, data=data, headers=headers, query=query)
2306
2307 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2308
2309 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2310 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2311 return None
2312
2313 return int(sum(
2314 float(line[len('#EXTINF:'):].split(',')[0])
2315 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2316
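# A sketch with a hypothetical playlist: a VOD manifest containing
#   #EXT-X-PLAYLIST-TYPE:VOD ... #EXTINF:4.5, ... #EXTINF:3.2, ...
# yields int(4.5 + 3.2) == 7 seconds.
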
2317 @staticmethod
2318 def _xpath_ns(path, namespace=None):
2319 if not namespace:
2320 return path
2321 out = []
2322 for c in path.split('/'):
2323 if not c or c == '.':
2324 out.append(c)
2325 else:
2326 out.append('{%s}%s' % (namespace, c))
2327 return '/'.join(out)
2328
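# A sketch (hypothetical namespace):
#   _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
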
2329 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2330 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2331
2332 if smil is False:
2333 assert not fatal
2334 return []
2335
2336 namespace = self._parse_smil_namespace(smil)
2337
2338 fmts = self._parse_smil_formats(
2339 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2340 subs = self._parse_smil_subtitles(
2341 smil, namespace=namespace)
2342
2343 return fmts, subs
2344
2345 def _extract_smil_formats(self, *args, **kwargs):
2346 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2347 if subs:
2348 self._report_ignoring_subs('SMIL')
2349 return fmts
2350
2351 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2352 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2353 if smil is False:
2354 return {}
2355 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2356
2357 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2358 return self._download_xml(
2359 smil_url, video_id, 'Downloading SMIL file',
2360 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2361
2362 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2363 namespace = self._parse_smil_namespace(smil)
2364
2365 formats = self._parse_smil_formats(
2366 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2367 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2368
2369 video_id = os.path.splitext(url_basename(smil_url))[0]
2370 title = None
2371 description = None
2372 upload_date = None
2373 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2374 name = meta.attrib.get('name')
2375 content = meta.attrib.get('content')
2376 if not name or not content:
2377 continue
2378 if not title and name == 'title':
2379 title = content
2380 elif not description and name in ('description', 'abstract'):
2381 description = content
2382 elif not upload_date and name == 'date':
2383 upload_date = unified_strdate(content)
2384
2385 thumbnails = [{
2386 'id': image.get('type'),
2387 'url': image.get('src'),
2388 'width': int_or_none(image.get('width')),
2389 'height': int_or_none(image.get('height')),
2390 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2391
2392 return {
2393 'id': video_id,
2394 'title': title or video_id,
2395 'description': description,
2396 'upload_date': upload_date,
2397 'thumbnails': thumbnails,
2398 'formats': formats,
2399 'subtitles': subtitles,
2400 }
2401
2402 def _parse_smil_namespace(self, smil):
2403 return self._search_regex(
2404 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2405
2406 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2407 base = smil_url
2408 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2409 b = meta.get('base') or meta.get('httpBase')
2410 if b:
2411 base = b
2412 break
2413
2414 formats = []
2415 rtmp_count = 0
2416 http_count = 0
2417 m3u8_count = 0
2418 imgs_count = 0
2419
2420 srcs = set()
2421 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2422 for medium in media:
2423 src = medium.get('src')
2424 if not src or src in srcs:
2425 continue
2426 srcs.add(src)
2427
2428 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2429 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2430 width = int_or_none(medium.get('width'))
2431 height = int_or_none(medium.get('height'))
2432 proto = medium.get('proto')
2433 ext = medium.get('ext')
2434 src_ext = determine_ext(src)
2435 streamer = medium.get('streamer') or base
2436
2437 if proto == 'rtmp' or streamer.startswith('rtmp'):
2438 rtmp_count += 1
2439 formats.append({
2440 'url': streamer,
2441 'play_path': src,
2442 'ext': 'flv',
2443 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2444 'tbr': bitrate,
2445 'filesize': filesize,
2446 'width': width,
2447 'height': height,
2448 })
2449 if transform_rtmp_url:
2450 streamer, src = transform_rtmp_url(streamer, src)
2451 formats[-1].update({
2452 'url': streamer,
2453 'play_path': src,
2454 })
2455 continue
2456
2457 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2458 src_url = src_url.strip()
2459
2460 if proto == 'm3u8' or src_ext == 'm3u8':
2461 m3u8_formats = self._extract_m3u8_formats(
2462 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2463 if len(m3u8_formats) == 1:
2464 m3u8_count += 1
2465 m3u8_formats[0].update({
2466 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2467 'tbr': bitrate,
2468 'width': width,
2469 'height': height,
2470 })
2471 formats.extend(m3u8_formats)
2472 elif src_ext == 'f4m':
2473 f4m_url = src_url
2474 if not f4m_params:
2475 f4m_params = {
2476 'hdcore': '3.2.0',
2477 'plugin': 'flowplayer-3.2.0.1',
2478 }
2479 f4m_url += '&' if '?' in f4m_url else '?'
2480 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2481 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2482 elif src_ext == 'mpd':
2483 formats.extend(self._extract_mpd_formats(
2484 src_url, video_id, mpd_id='dash', fatal=False))
2485 elif re.search(r'\.ism/[Mm]anifest', src_url):
2486 formats.extend(self._extract_ism_formats(
2487 src_url, video_id, ism_id='mss', fatal=False))
2488 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2489 http_count += 1
2490 formats.append({
2491 'url': src_url,
2492 'ext': ext or src_ext or 'flv',
2493 'format_id': 'http-%d' % (bitrate or http_count),
2494 'tbr': bitrate,
2495 'filesize': filesize,
2496 'width': width,
2497 'height': height,
2498 })
2499
2500 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2501 src = medium.get('src')
2502 if not src or src in srcs:
2503 continue
2504 srcs.add(src)
2505
2506 imgs_count += 1
2507 formats.append({
2508 'format_id': 'imagestream-%d' % (imgs_count),
2509 'url': src,
2510 'ext': mimetype2ext(medium.get('type')),
2511 'acodec': 'none',
2512 'vcodec': 'none',
2513 'width': int_or_none(medium.get('width')),
2514 'height': int_or_none(medium.get('height')),
2515 'format_note': 'SMIL storyboards',
2516 })
2517
2518 return formats
2519
2520 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2521 urls = []
2522 subtitles = {}
2523 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2524 src = textstream.get('src')
2525 if not src or src in urls:
2526 continue
2527 urls.append(src)
2528 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2529 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2530 subtitles.setdefault(lang, []).append({
2531 'url': src,
2532 'ext': ext,
2533 })
2534 return subtitles
2535
2536 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2537 xspf = self._download_xml(
2538 xspf_url, playlist_id, 'Downloading xspf playlist',
2539 'Unable to download xspf manifest', fatal=fatal)
2540 if xspf is False:
2541 return []
2542 return self._parse_xspf(
2543 xspf, playlist_id, xspf_url=xspf_url,
2544 xspf_base_url=base_url(xspf_url))
2545
2546 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2547 NS_MAP = {
2548 'xspf': 'http://xspf.org/ns/0/',
2549 's1': 'http://static.streamone.nl/player/ns/0',
2550 }
2551
2552 entries = []
2553 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2554 title = xpath_text(
2555 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2556 description = xpath_text(
2557 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2558 thumbnail = xpath_text(
2559 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2560 duration = float_or_none(
2561 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2562
2563 formats = []
2564 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2565 format_url = urljoin(xspf_base_url, location.text)
2566 if not format_url:
2567 continue
2568 formats.append({
2569 'url': format_url,
2570 'manifest_url': xspf_url,
2571 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2572 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2573 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2574 })
2575 self._sort_formats(formats)
2576
2577 entries.append({
2578 'id': playlist_id,
2579 'title': title,
2580 'description': description,
2581 'thumbnail': thumbnail,
2582 'duration': duration,
2583 'formats': formats,
2584 })
2585 return entries
2586
2587 def _extract_mpd_formats(self, *args, **kwargs):
2588 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2589 if subs:
2590 self._report_ignoring_subs('DASH')
2591 return fmts
2592
2593 def _extract_mpd_formats_and_subtitles(
2594 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2595 fatal=True, data=None, headers={}, query={}):
2596 res = self._download_xml_handle(
2597 mpd_url, video_id,
2598 note='Downloading MPD manifest' if note is None else note,
2599 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2600 fatal=fatal, data=data, headers=headers, query=query)
2601 if res is False:
2602 return [], {}
2603 mpd_doc, urlh = res
2604 if mpd_doc is None:
2605 return [], {}
2606 mpd_base_url = base_url(urlh.geturl())
2607
2608 return self._parse_mpd_formats_and_subtitles(
2609 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2610
2611 def _parse_mpd_formats(self, *args, **kwargs):
2612 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2613 if subs:
2614 self._report_ignoring_subs('DASH')
2615 return fmts
2616
2617 def _parse_mpd_formats_and_subtitles(
2618 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2619 """
2620 Parse formats from MPD manifest.
2621 References:
2622 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2623 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2624 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2625 """
2626 if not self.get_param('dynamic_mpd', True):
2627 if mpd_doc.get('type') == 'dynamic':
2628 return [], {}
2629
2630 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2631
2632 def _add_ns(path):
2633 return self._xpath_ns(path, namespace)
2634
2635 def is_drm_protected(element):
2636 return element.find(_add_ns('ContentProtection')) is not None
2637
2638 def extract_multisegment_info(element, ms_parent_info):
2639 ms_info = ms_parent_info.copy()
2640
2641 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2642 # common attributes and elements; we extract only those that are
2643 # relevant for us.
2644 def extract_common(source):
2645 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2646 if segment_timeline is not None:
2647 s_e = segment_timeline.findall(_add_ns('S'))
2648 if s_e:
2649 ms_info['total_number'] = 0
2650 ms_info['s'] = []
2651 for s in s_e:
2652 r = int(s.get('r', 0))
2653 ms_info['total_number'] += 1 + r
2654 ms_info['s'].append({
2655 't': int(s.get('t', 0)),
2656 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2657 'd': int(s.attrib['d']),
2658 'r': r,
2659 })
2660 start_number = source.get('startNumber')
2661 if start_number:
2662 ms_info['start_number'] = int(start_number)
2663 timescale = source.get('timescale')
2664 if timescale:
2665 ms_info['timescale'] = int(timescale)
2666 segment_duration = source.get('duration')
2667 if segment_duration:
2668 ms_info['segment_duration'] = float(segment_duration)
2669
2670 def extract_Initialization(source):
2671 initialization = source.find(_add_ns('Initialization'))
2672 if initialization is not None:
2673 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2674
2675 segment_list = element.find(_add_ns('SegmentList'))
2676 if segment_list is not None:
2677 extract_common(segment_list)
2678 extract_Initialization(segment_list)
2679 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2680 if segment_urls_e:
2681 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2682 else:
2683 segment_template = element.find(_add_ns('SegmentTemplate'))
2684 if segment_template is not None:
2685 extract_common(segment_template)
2686 media = segment_template.get('media')
2687 if media:
2688 ms_info['media'] = media
2689 initialization = segment_template.get('initialization')
2690 if initialization:
2691 ms_info['initialization'] = initialization
2692 else:
2693 extract_Initialization(segment_template)
2694 return ms_info
2695
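# A sketch of the SegmentTimeline handling above (hypothetical MPD):
#   <SegmentTimeline><S t="0" d="4000" r="2"/></SegmentTimeline>
# with timescale=1000 yields ms_info['s'] == [{'t': 0, 'd': 4000, 'r': 2}]
# and total_number == 3 (one segment plus r == 2 repeats), i.e. three
# 4-second fragments.
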
2696 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2697 formats, subtitles = [], {}
2698 stream_numbers = collections.defaultdict(int)
2699 for period in mpd_doc.findall(_add_ns('Period')):
2700 period_duration = parse_duration(period.get('duration')) or mpd_duration
2701 period_ms_info = extract_multisegment_info(period, {
2702 'start_number': 1,
2703 'timescale': 1,
2704 })
2705 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2706 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2707 for representation in adaptation_set.findall(_add_ns('Representation')):
2708 representation_attrib = adaptation_set.attrib.copy()
2709 representation_attrib.update(representation.attrib)
2710 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2711 mime_type = representation_attrib['mimeType']
2712 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2713
2714 codecs = representation_attrib.get('codecs', '')
2715 if content_type not in ('video', 'audio', 'text'):
2716 if mime_type == 'image/jpeg':
2717 content_type = mime_type
2718 elif codecs.split('.')[0] == 'stpp':
2719 content_type = 'text'
2720 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2721 content_type = 'text'
2722 else:
2723 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2724 continue
2725
2726 base_url = ''
2727 for element in (representation, adaptation_set, period, mpd_doc):
2728 base_url_e = element.find(_add_ns('BaseURL'))
2729 if base_url_e is not None:
2730 base_url = base_url_e.text + base_url
2731 if re.match(r'^https?://', base_url):
2732 break
2733 if mpd_base_url and base_url.startswith('/'):
2734 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2735 elif mpd_base_url and not re.match(r'^https?://', base_url):
2736 if not mpd_base_url.endswith('/'):
2737 mpd_base_url += '/'
2738 base_url = mpd_base_url + base_url
2739 representation_id = representation_attrib.get('id')
2740 lang = representation_attrib.get('lang')
2741 url_el = representation.find(_add_ns('BaseURL'))
2742 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2743 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2744 if representation_id is not None:
2745 format_id = representation_id
2746 else:
2747 format_id = content_type
2748 if mpd_id:
2749 format_id = mpd_id + '-' + format_id
2750 if content_type in ('video', 'audio'):
2751 f = {
2752 'format_id': format_id,
2753 'manifest_url': mpd_url,
2754 'ext': mimetype2ext(mime_type),
2755 'width': int_or_none(representation_attrib.get('width')),
2756 'height': int_or_none(representation_attrib.get('height')),
2757 'tbr': float_or_none(bandwidth, 1000),
2758 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2759 'fps': int_or_none(representation_attrib.get('frameRate')),
2760 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2761 'format_note': 'DASH %s' % content_type,
2762 'filesize': filesize,
2763 'container': mimetype2ext(mime_type) + '_dash',
2764 }
2765 f.update(parse_codecs(codecs))
2766 elif content_type == 'text':
2767 f = {
2768 'ext': mimetype2ext(mime_type),
2769 'manifest_url': mpd_url,
2770 'filesize': filesize,
2771 }
2772 elif content_type == 'image/jpeg':
2773 # See test case in VikiIE
2774 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2775 f = {
2776 'format_id': format_id,
2777 'ext': 'mhtml',
2778 'manifest_url': mpd_url,
2779 'format_note': 'DASH storyboards (jpeg)',
2780 'acodec': 'none',
2781 'vcodec': 'none',
2782 }
2783 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2784 f['has_drm'] = True
2785 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2786
2787 def prepare_template(template_name, identifiers):
2788 tmpl = representation_ms_info[template_name]
2789 # First of all, % characters outside $...$ templates
2790 # must be escaped by doubling for proper processing
2791 # by the % string-formatting operator used further below (see
2792 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2793 t = ''
2794 in_template = False
2795 for c in tmpl:
2796 t += c
2797 if c == '$':
2798 in_template = not in_template
2799 elif c == '%' and not in_template:
2800 t += c
2801 # Next, $...$ templates are translated to their
2802 # %(...) counterparts to be used with % operator
2803 if representation_id is not None:
2804 t = t.replace('$RepresentationID$', representation_id)
2805 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2806 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2807 t = t.replace('$$', '$')  # unescape "$$"; str.replace returns a new string
2808 return t
2809
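# A sketch of the translation performed above (hypothetical template):
#   '$RepresentationID$/seg-$Number%05d$.m4s' with representation_id
# 'video_1' becomes 'video_1/seg-%(Number)05d.m4s', ready for
# % formatting with e.g. {'Number': 5} -> 'video_1/seg-00005.m4s'.
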
2810 # @initialization is a regular template like @media one
2811 # so it should be handled just the same way (see
2812 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2813 if 'initialization' in representation_ms_info:
2814 initialization_template = prepare_template(
2815 'initialization',
2816 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2817 # $Time$ shall not be included for @initialization thus
2818 # only $Bandwidth$ remains
2819 ('Bandwidth', ))
2820 representation_ms_info['initialization_url'] = initialization_template % {
2821 'Bandwidth': bandwidth,
2822 }
2823
2824 def location_key(location):
2825 return 'url' if re.match(r'^https?://', location) else 'path'
2826
2827 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2828
2829 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2830 media_location_key = location_key(media_template)
2831
2832 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2833 # can't be used at the same time
2834 if '%(Number' in media_template and 's' not in representation_ms_info:
2835 segment_duration = None
2836 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2837 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2838 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2839 representation_ms_info['fragments'] = [{
2840 media_location_key: media_template % {
2841 'Number': segment_number,
2842 'Bandwidth': bandwidth,
2843 },
2844 'duration': segment_duration,
2845 } for segment_number in range(
2846 representation_ms_info['start_number'],
2847 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2848 else:
2849 # $Number*$ or $Time$ in media template with S list available
2850 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2851 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2852 representation_ms_info['fragments'] = []
2853 segment_time = 0
2854 segment_d = None
2855 segment_number = representation_ms_info['start_number']
2856
2857 def add_segment_url():
2858 segment_url = media_template % {
2859 'Time': segment_time,
2860 'Bandwidth': bandwidth,
2861 'Number': segment_number,
2862 }
2863 representation_ms_info['fragments'].append({
2864 media_location_key: segment_url,
2865 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2866 })
2867
2868 for num, s in enumerate(representation_ms_info['s']):
2869 segment_time = s.get('t') or segment_time
2870 segment_d = s['d']
2871 add_segment_url()
2872 segment_number += 1
2873 for r in range(s.get('r', 0)):
2874 segment_time += segment_d
2875 add_segment_url()
2876 segment_number += 1
2877 segment_time += segment_d
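# For example (hypothetical S list, not from any real manifest): with
# timescale 90000, an S element {'t': 0, 'd': 360000, 'r': 2} expands
# to three 4-second fragments whose $Time$ values are 0, 360000 and
# 720000: one for the element itself plus @r additional repeats.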
2878 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2879 # No media template
2880 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2881 # or any YouTube dashsegments video
2882 fragments = []
2883 segment_index = 0
2884 timescale = representation_ms_info['timescale']
2885 for s in representation_ms_info['s']:
2886 duration = float_or_none(s['d'], timescale)
2887 for r in range(s.get('r', 0) + 1):
2888 segment_uri = representation_ms_info['segment_urls'][segment_index]
2889 fragments.append({
2890 location_key(segment_uri): segment_uri,
2891 'duration': duration,
2892 })
2893 segment_index += 1
2894 representation_ms_info['fragments'] = fragments
2895 elif 'segment_urls' in representation_ms_info:
2896 # Segment URLs with no SegmentTimeline
2897 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2898 # https://github.com/ytdl-org/youtube-dl/pull/14844
2899 fragments = []
2900 segment_duration = float_or_none(
2901 representation_ms_info['segment_duration'],
2902 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2903 for segment_url in representation_ms_info['segment_urls']:
2904 fragment = {
2905 location_key(segment_url): segment_url,
2906 }
2907 if segment_duration:
2908 fragment['duration'] = segment_duration
2909 fragments.append(fragment)
2910 representation_ms_info['fragments'] = fragments
2911 # If a fragments key is available then we correctly recognized fragmented media.
2912 # Otherwise we will assume unfragmented media with direct access. Technically, this
2913 # assumption is not necessarily correct since we may simply have no support for
2914 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2915 if 'fragments' in representation_ms_info:
2916 f.update({
2917 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2918 'url': mpd_url or base_url,
2919 'fragment_base_url': base_url,
2920 'fragments': [],
2921 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2922 })
2923 if 'initialization_url' in representation_ms_info:
2924 initialization_url = representation_ms_info['initialization_url']
2925 if not f.get('url'):
2926 f['url'] = initialization_url
2927 f['fragments'].append({location_key(initialization_url): initialization_url})
2928 f['fragments'].extend(representation_ms_info['fragments'])
2929 else:
2930 # Assuming direct URL to unfragmented media.
2931 f['url'] = base_url
2932 if content_type in ('video', 'audio', 'image/jpeg'):
2933 f['manifest_stream_number'] = stream_numbers[f['url']]
2934 stream_numbers[f['url']] += 1
2935 formats.append(f)
2936 elif content_type == 'text':
2937 subtitles.setdefault(lang or 'und', []).append(f)
2938
2939 return formats, subtitles
2940
2941 def _extract_ism_formats(self, *args, **kwargs):
2942 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2943 if subs:
2944 self._report_ignoring_subs('ISM')
2945 return fmts
2946
2947 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2948 res = self._download_xml_handle(
2949 ism_url, video_id,
2950 note='Downloading ISM manifest' if note is None else note,
2951 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2952 fatal=fatal, data=data, headers=headers, query=query)
2953 if res is False:
2954 return [], {}
2955 ism_doc, urlh = res
2956 if ism_doc is None:
2957 return [], {}
2958
2959 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2960
2961 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2962 """
2963 Parse formats from ISM manifest.
2964 References:
2965 1. [MS-SSTR]: Smooth Streaming Protocol,
2966 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2967 """
2968 if ism_doc.get('IsLive') == 'TRUE':
2969 return [], {}
2970
2971 duration = int(ism_doc.attrib['Duration'])
2972 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2973
2974 formats = []
2975 subtitles = {}
2976 for stream in ism_doc.findall('StreamIndex'):
2977 stream_type = stream.get('Type')
2978 if stream_type not in ('video', 'audio', 'text'):
2979 continue
2980 url_pattern = stream.attrib['Url']
2981 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2982 stream_name = stream.get('Name')
2983 stream_language = stream.get('Language', 'und')
2984 for track in stream.findall('QualityLevel'):
2985 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2986 # TODO: add support for WVC1 and WMAP
2987 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2988 self.report_warning('%s is not a supported codec' % fourcc)
2989 continue
2990 tbr = int(track.attrib['Bitrate']) // 1000
2991 # [1] does not mention Width and Height attributes. However,
2992 # they're often present while MaxWidth and MaxHeight are
2993 # missing, so they should be used as fallbacks
2994 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2995 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2996 sampling_rate = int_or_none(track.get('SamplingRate'))
2997
2998 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2999 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3000
3001 fragments = []
3002 fragment_ctx = {
3003 'time': 0,
3004 }
3005 stream_fragments = stream.findall('c')
3006 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3007 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3008 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3009 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3010 if not fragment_ctx['duration']:
3011 try:
3012 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the <c> list, not the current element's children
3013 except IndexError:
3014 next_fragment_time = duration
3015 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3016 for _ in range(fragment_repeat):
3017 fragments.append({
3018 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3019 'duration': fragment_ctx['duration'] / stream_timescale,
3020 })
3021 fragment_ctx['time'] += fragment_ctx['duration']
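# For example (hypothetical manifest data): a <c t="0" d="20000000" r="2"/>
# element with a stream timescale of 10000000 yields exactly two 2-second
# fragments, at times 0 and 20000000. Note that, unlike DASH SegmentTimeline
# where @r counts *additional* repeats, @r is treated here as the total
# number of contiguous fragments.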
3022
3023 if stream_type == 'text':
3024 subtitles.setdefault(stream_language, []).append({
3025 'ext': 'ismt',
3026 'protocol': 'ism',
3027 'url': ism_url,
3028 'manifest_url': ism_url,
3029 'fragments': fragments,
3030 '_download_params': {
3031 'stream_type': stream_type,
3032 'duration': duration,
3033 'timescale': stream_timescale,
3034 'fourcc': fourcc,
3035 'language': stream_language,
3036 'codec_private_data': track.get('CodecPrivateData'),
3037 }
3038 })
3039 elif stream_type in ('video', 'audio'):
3040 formats.append({
3041 'format_id': join_nonempty(ism_id, stream_name, tbr),
3042 'url': ism_url,
3043 'manifest_url': ism_url,
3044 'ext': 'ismv' if stream_type == 'video' else 'isma',
3045 'width': width,
3046 'height': height,
3047 'tbr': tbr,
3048 'asr': sampling_rate,
3049 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3050 'acodec': 'none' if stream_type == 'video' else fourcc,
3051 'protocol': 'ism',
3052 'fragments': fragments,
3053 'has_drm': ism_doc.find('Protection') is not None,
3054 '_download_params': {
3055 'stream_type': stream_type,
3056 'duration': duration,
3057 'timescale': stream_timescale,
3058 'width': width or 0,
3059 'height': height or 0,
3060 'fourcc': fourcc,
3061 'language': stream_language,
3062 'codec_private_data': track.get('CodecPrivateData'),
3063 'sampling_rate': sampling_rate,
3064 'channels': int_or_none(track.get('Channels', 2)),
3065 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3066 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3067 },
3068 })
3069 return formats, subtitles
3070
3071 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3072 def absolute_url(item_url):
3073 return urljoin(base_url, item_url)
3074
3075 def parse_content_type(content_type):
3076 if not content_type:
3077 return {}
3078 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3079 if ctr:
3080 mimetype, codecs = ctr.groups()
3081 f = parse_codecs(codecs)
3082 f['ext'] = mimetype2ext(mimetype)
3083 return f
3084 return {}
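# For illustration (hypothetical attribute value): a type attribute like
# 'video/mp4; codecs="avc1.64001f, mp4a.40.2"' yields {'ext': 'mp4'}
# merged with the vcodec/acodec fields that parse_codecs() derives from
# the codecs string; the result is folded into the format dict below.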
3085
3086 def _media_formats(src, cur_media_type, type_info={}):
3087 full_url = absolute_url(src)
3088 ext = type_info.get('ext') or determine_ext(full_url)
3089 if ext == 'm3u8':
3090 is_plain_url = False
3091 formats = self._extract_m3u8_formats(
3092 full_url, video_id, ext='mp4',
3093 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3094 preference=preference, quality=quality, fatal=False)
3095 elif ext == 'mpd':
3096 is_plain_url = False
3097 formats = self._extract_mpd_formats(
3098 full_url, video_id, mpd_id=mpd_id, fatal=False)
3099 else:
3100 is_plain_url = True
3101 formats = [{
3102 'url': full_url,
3103 'vcodec': 'none' if cur_media_type == 'audio' else None,
3104 }]
3105 return is_plain_url, formats
3106
3107 entries = []
3108 # amp-video and amp-audio are very similar to their HTML5 counterparts
3109 # so we will include them right here (see
3110 # https://www.ampproject.org/docs/reference/components/amp-video)
3111 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3112 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3113 media_tags = [(media_tag, media_tag_name, media_type, '')
3114 for media_tag, media_tag_name, media_type
3115 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3116 media_tags.extend(re.findall(
3117 # We only allow video|audio followed by a whitespace or '>'.
3118 # Allowing more characters may result in a significant slowdown (see
3119 # https://github.com/ytdl-org/youtube-dl/issues/11979; example URL:
3120 # http://www.porntrex.com/maps/videositemap.xml).
3121 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3122 for media_tag, _, media_type, media_content in media_tags:
3123 media_info = {
3124 'formats': [],
3125 'subtitles': {},
3126 }
3127 media_attributes = extract_attributes(media_tag)
3128 src = strip_or_none(media_attributes.get('src'))
3129 if src:
3130 _, formats = _media_formats(src, media_type)
3131 media_info['formats'].extend(formats)
3132 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3133 if media_content:
3134 for source_tag in re.findall(r'<source[^>]+>', media_content):
3135 s_attr = extract_attributes(source_tag)
3136 # data-video-src and data-src are non-standard but seen
3137 # several times in the wild
3138 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3139 if not src:
3140 continue
3141 f = parse_content_type(s_attr.get('type'))
3142 is_plain_url, formats = _media_formats(src, media_type, f)
3143 if is_plain_url:
3144 # width, height, res, label and title attributes are
3145 # all non-standard but seen several times in the wild
3146 labels = [
3147 s_attr.get(lbl)
3148 for lbl in ('label', 'title')
3149 if str_or_none(s_attr.get(lbl))
3150 ]
3151 width = int_or_none(s_attr.get('width'))
3152 height = (int_or_none(s_attr.get('height'))
3153 or int_or_none(s_attr.get('res')))
3154 if not width or not height:
3155 for lbl in labels:
3156 resolution = parse_resolution(lbl)
3157 if not resolution:
3158 continue
3159 width = width or resolution.get('width')
3160 height = height or resolution.get('height')
3161 for lbl in labels:
3162 tbr = parse_bitrate(lbl)
3163 if tbr:
3164 break
3165 else:
3166 tbr = None
3167 f.update({
3168 'width': width,
3169 'height': height,
3170 'tbr': tbr,
3171 'format_id': s_attr.get('label') or s_attr.get('title'),
3172 })
3173 f.update(formats[0])
3174 media_info['formats'].append(f)
3175 else:
3176 media_info['formats'].extend(formats)
3177 for track_tag in re.findall(r'<track[^>]+>', media_content):
3178 track_attributes = extract_attributes(track_tag)
3179 kind = track_attributes.get('kind')
3180 if not kind or kind in ('subtitles', 'captions'):
3181 src = strip_or_none(track_attributes.get('src'))
3182 if not src:
3183 continue
3184 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3185 media_info['subtitles'].setdefault(lang, []).append({
3186 'url': absolute_url(src),
3187 })
3188 for f in media_info['formats']:
3189 f.setdefault('http_headers', {})['Referer'] = base_url
3190 if media_info['formats'] or media_info['subtitles']:
3191 entries.append(media_info)
3192 return entries
3193
3194 def _extract_akamai_formats(self, *args, **kwargs):
3195 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3196 if subs:
3197 self._report_ignoring_subs('akamai')
3198 return fmts
3199
3200 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3201 signed = 'hdnea=' in manifest_url
3202 if not signed:
3203 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3204 manifest_url = re.sub(
3205 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3206 '', manifest_url).strip('?')
3207
3208 formats = []
3209 subtitles = {}
3210
3211 hdcore_sign = 'hdcore=3.7.0'
3212 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3213 hds_host = hosts.get('hds')
3214 if hds_host:
3215 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3216 if 'hdcore=' not in f4m_url:
3217 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3218 f4m_formats = self._extract_f4m_formats(
3219 f4m_url, video_id, f4m_id='hds', fatal=False)
3220 for entry in f4m_formats:
3221 entry.update({'extra_param_to_segment_url': hdcore_sign})
3222 formats.extend(f4m_formats)
3223
3224 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3225 hls_host = hosts.get('hls')
3226 if hls_host:
3227 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3228 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3229 m3u8_url, video_id, 'mp4', 'm3u8_native',
3230 m3u8_id='hls', fatal=False)
3231 formats.extend(m3u8_formats)
3232 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3233
3234 http_host = hosts.get('http')
3235 if http_host and m3u8_formats and not signed:
3236 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3237 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3238 qualities_length = len(qualities)
3239 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3240 i = 0
3241 for f in m3u8_formats:
3242 if f['vcodec'] != 'none':
3243 for protocol in ('http', 'https'):
3244 http_f = f.copy()
3245 del http_f['manifest_url']
3246 http_url = re.sub(
3247 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3248 http_f.update({
3249 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3250 'url': http_url,
3251 'protocol': protocol,
3252 })
3253 formats.append(http_f)
3254 i += 1
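# Illustrative rewrite (hypothetical host and URL): for
# m3u8_url = 'https://example-i.akamaihd.net/i/media/video_,360,720,.mp4.csmil/master.m3u8'
# the qualities list is ['360', '720'], and each HLS video format is
# cloned into e.g. 'http://<http_host>/media/video_360.mp4', i.e. a
# direct progressive URL with the HLS wrapper stripped.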
3255
3256 return formats, subtitles
3257
3258 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3259 query = compat_urlparse.urlparse(url).query
3260 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3261 mobj = re.search(
3262 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3263 url_base = mobj.group('url')
3264 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3265 formats = []
3266
3267 def manifest_url(manifest):
3268 m_url = '%s/%s' % (http_base_url, manifest)
3269 if query:
3270 m_url += '?%s' % query
3271 return m_url
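# For example (hypothetical URL): given
# 'https://example.com/vod/mp4:sample.mp4/playlist.m3u8?token=abc',
# url_base is '//example.com/vod/mp4:sample.mp4' and
# manifest_url('manifest.mpd') yields
# 'https://example.com/vod/mp4:sample.mp4/manifest.mpd?token=abc'.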
3272
3273 if 'm3u8' not in skip_protocols:
3274 formats.extend(self._extract_m3u8_formats(
3275 manifest_url('playlist.m3u8'), video_id, 'mp4',
3276 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3277 if 'f4m' not in skip_protocols:
3278 formats.extend(self._extract_f4m_formats(
3279 manifest_url('manifest.f4m'),
3280 video_id, f4m_id='hds', fatal=False))
3281 if 'dash' not in skip_protocols:
3282 formats.extend(self._extract_mpd_formats(
3283 manifest_url('manifest.mpd'),
3284 video_id, mpd_id='dash', fatal=False))
3285 if re.search(r'(?:/smil:|\.smil)', url_base):
3286 if 'smil' not in skip_protocols:
3287 rtmp_formats = self._extract_smil_formats(
3288 manifest_url('jwplayer.smil'),
3289 video_id, fatal=False)
3290 for rtmp_format in rtmp_formats:
3291 rtsp_format = rtmp_format.copy()
3292 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3293 del rtsp_format['play_path']
3294 del rtsp_format['ext']
3295 rtsp_format.update({
3296 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3297 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3298 'protocol': 'rtsp',
3299 })
3300 formats.extend([rtmp_format, rtsp_format])
3301 else:
3302 for protocol in ('rtmp', 'rtsp'):
3303 if protocol not in skip_protocols:
3304 formats.append({
3305 'url': '%s:%s' % (protocol, url_base),
3306 'format_id': protocol,
3307 'protocol': protocol,
3308 })
3309 return formats
3310
3311 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3312 mobj = re.search(
3313 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3314 webpage)
3315 if mobj:
3316 try:
3317 jwplayer_data = self._parse_json(mobj.group('options'),
3318 video_id=video_id,
3319 transform_source=transform_source)
3320 except ExtractorError:
3321 pass
3322 else:
3323 if isinstance(jwplayer_data, dict):
3324 return jwplayer_data
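# The regex above targets embeds of the form (hypothetical page snippet):
#   jwplayer("myPlayer").setup({"playlist": [{"sources": [...]}]});
# js_to_json() then relaxes the JavaScript object literal enough for
# _parse_json to load it. Note the (?P<options>[^)]+) group stops at the
# first ')', so setups containing nested function calls are not fully
# captured.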
3325
3326 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3327 jwplayer_data = self._find_jwplayer_data(
3328 webpage, video_id, transform_source=js_to_json)
3329 return self._parse_jwplayer_data(
3330 jwplayer_data, video_id, *args, **kwargs)
3331
3332 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3333 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3334 # JWPlayer backward compatibility: flattened playlists
3335 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3336 if 'playlist' not in jwplayer_data:
3337 jwplayer_data = {'playlist': [jwplayer_data]}
3338
3339 entries = []
3340
3341 # JWPlayer backward compatibility: single playlist item
3342 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3343 if not isinstance(jwplayer_data['playlist'], list):
3344 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3345
3346 for video_data in jwplayer_data['playlist']:
3347 # JWPlayer backward compatibility: flattened sources
3348 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3349 if 'sources' not in video_data:
3350 video_data['sources'] = [video_data]
3351
3352 this_video_id = video_id or video_data['mediaid']
3353
3354 formats = self._parse_jwplayer_formats(
3355 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3356 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3357
3358 subtitles = {}
3359 tracks = video_data.get('tracks')
3360 if tracks and isinstance(tracks, list):
3361 for track in tracks:
3362 if not isinstance(track, dict):
3363 continue
3364 track_kind = track.get('kind')
3365 if not track_kind or not isinstance(track_kind, compat_str):
3366 continue
3367 if track_kind.lower() not in ('captions', 'subtitles'):
3368 continue
3369 track_url = urljoin(base_url, track.get('file'))
3370 if not track_url:
3371 continue
3372 subtitles.setdefault(track.get('label') or 'en', []).append({
3373 'url': self._proto_relative_url(track_url)
3374 })
3375
3376 entry = {
3377 'id': this_video_id,
3378 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3379 'description': clean_html(video_data.get('description')),
3380 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3381 'timestamp': int_or_none(video_data.get('pubdate')),
3382 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3383 'subtitles': subtitles,
3384 }
3385 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3386 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3387 entry.update({
3388 '_type': 'url_transparent',
3389 'url': formats[0]['url'],
3390 })
3391 else:
3392 self._sort_formats(formats)
3393 entry['formats'] = formats
3394 entries.append(entry)
3395 if len(entries) == 1:
3396 return entries[0]
3397 else:
3398 return self.playlist_result(entries)
3399
3400 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3401 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3402 urls = []
3403 formats = []
3404 for source in jwplayer_sources_data:
3405 if not isinstance(source, dict):
3406 continue
3407 source_url = urljoin(
3408 base_url, self._proto_relative_url(source.get('file')))
3409 if not source_url or source_url in urls:
3410 continue
3411 urls.append(source_url)
3412 source_type = source.get('type') or ''
3413 ext = mimetype2ext(source_type) or determine_ext(source_url)
3414 if source_type == 'hls' or ext == 'm3u8':
3415 formats.extend(self._extract_m3u8_formats(
3416 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3417 m3u8_id=m3u8_id, fatal=False))
3418 elif source_type == 'dash' or ext == 'mpd':
3419 formats.extend(self._extract_mpd_formats(
3420 source_url, video_id, mpd_id=mpd_id, fatal=False))
3421 elif ext == 'smil':
3422 formats.extend(self._extract_smil_formats(
3423 source_url, video_id, fatal=False))
3424 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3425 elif source_type.startswith('audio') or ext in (
3426 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3427 formats.append({
3428 'url': source_url,
3429 'vcodec': 'none',
3430 'ext': ext,
3431 })
3432 else:
3433 height = int_or_none(source.get('height'))
3434 if height is None:
3435 # Often no height is provided but there is a label in
3436 # a format like "1080p", "720p SD", or 1080.
3437 height = int_or_none(self._search_regex(
3438 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3439 'height', default=None))
3440 a_format = {
3441 'url': source_url,
3442 'width': int_or_none(source.get('width')),
3443 'height': height,
3444 'tbr': int_or_none(source.get('bitrate')),
3445 'ext': ext,
3446 }
3447 if source_url.startswith('rtmp'):
3448 a_format['ext'] = 'flv'
3449 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3450 # of jwplayer.flash.swf
3451 rtmp_url_parts = re.split(
3452 r'((?:mp4|mp3|flv):)', source_url, 1)
3453 if len(rtmp_url_parts) == 3:
3454 rtmp_url, prefix, play_path = rtmp_url_parts
3455 a_format.update({
3456 'url': rtmp_url,
3457 'play_path': prefix + play_path,
3458 })
3459 if rtmp_params:
3460 a_format.update(rtmp_params)
3461 formats.append(a_format)
3462 return formats
3463
3464 def _live_title(self, name):
3465 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3466 return name
3467
3468 def _int(self, v, name, fatal=False, **kwargs):
3469 res = int_or_none(v, **kwargs)
3472 if res is None:
3473 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3474 if fatal:
3475 raise ExtractorError(msg)
3476 else:
3477 self.report_warning(msg)
3478 return res
3479
3480 def _float(self, v, name, fatal=False, **kwargs):
3481 res = float_or_none(v, **kwargs)
3482 if res is None:
3483 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3484 if fatal:
3485 raise ExtractorError(msg)
3486 else:
3487 self.report_warning(msg)
3488 return res
3489
3490 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3491 path='/', secure=False, discard=False, rest={}, **kwargs):
3492 cookie = compat_cookiejar_Cookie(
3493 0, name, value, port, port is not None, domain, True,
3494 domain.startswith('.'), path, True, secure, expire_time,
3495 discard, None, None, rest)
3496 self._downloader.cookiejar.set_cookie(cookie)
3497
3498 def _get_cookies(self, url):
3499 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3500 req = sanitized_Request(url)
3501 self._downloader.cookiejar.add_cookie_header(req)
3502 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3503
3504 def _apply_first_set_cookie_header(self, url_handle, cookie):
3505 """
3506 Apply first Set-Cookie header instead of the last. Experimental.
3507
3508 Some sites (e.g. [1-3]) may serve two cookies under the same name
3509 in the Set-Cookie header and expect the first (old) one to be set
3510 rather than the second (new) one. However, per RFC 6265 the newer
3511 cookie is the one that actually ends up in the cookie store.
3512 We work around this issue by manually resetting the cookie to
3513 the first one.
3514 1. https://new.vk.com/
3515 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3516 3. https://learning.oreilly.com/
3517 """
3518 for header, cookies in url_handle.headers.items():
3519 if header.lower() != 'set-cookie':
3520 continue
3521 if sys.version_info[0] >= 3:
3522 cookies = cookies.encode('iso-8859-1')
3523 cookies = cookies.decode('utf-8')
3524 cookie_value = re.search(
3525 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3526 if cookie_value:
3527 value, domain = cookie_value.groups()
3528 self._set_cookie(domain, cookie, value)
3529 break
3530
3531 def get_testcases(self, include_onlymatching=False):
3532 t = getattr(self, '_TEST', None)
3533 if t:
3534 assert not hasattr(self, '_TESTS'), \
3535 '%s has _TEST and _TESTS' % type(self).__name__
3536 tests = [t]
3537 else:
3538 tests = getattr(self, '_TESTS', [])
3539 for t in tests:
3540 if not include_onlymatching and t.get('only_matching', False):
3541 continue
3542 t['name'] = type(self).__name__[:-len('IE')]
3543 yield t
3544
3545 def is_suitable(self, age_limit):
3546 """ Test whether the extractor is generally suitable for the given
3547 age limit (i.e. pornographic sites are not, all others usually are) """
3548
3549 any_restricted = False
3550 for tc in self.get_testcases(include_onlymatching=False):
3551 if tc.get('playlist', []):
3552 tc = tc['playlist'][0]
3553 is_restricted = age_restricted(
3554 tc.get('info_dict', {}).get('age_limit'), age_limit)
3555 if not is_restricted:
3556 return True
3557 any_restricted = any_restricted or is_restricted
3558 return not any_restricted
3559
3560 def extract_subtitles(self, *args, **kwargs):
3561 if (self.get_param('writesubtitles', False)
3562 or self.get_param('listsubtitles')):
3563 return self._get_subtitles(*args, **kwargs)
3564 return {}
3565
3566 def _get_subtitles(self, *args, **kwargs):
3567 raise NotImplementedError('This method must be implemented by subclasses')
3568
3569 def extract_comments(self, *args, **kwargs):
3570 if not self.get_param('getcomments'):
3571 return None
3572 generator = self._get_comments(*args, **kwargs)
3573
3574 def extractor():
3575 comments = []
3576 interrupted = True
3577 try:
3578 while True:
3579 comments.append(next(generator))
3580 except StopIteration:
3581 interrupted = False
3582 except KeyboardInterrupt:
3583 self.to_screen('Interrupted by user')
3584 except Exception as e:
3585 if self.get_param('ignoreerrors') is not True:
3586 raise
3587 self._downloader.report_error(e)
3588 comment_count = len(comments)
3589 self.to_screen(f'Extracted {comment_count} comments')
3590 return {
3591 'comments': comments,
3592 'comment_count': None if interrupted else comment_count
3593 }
3594 return extractor
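# Minimal usage sketch (hypothetical extractor code): the returned
# closure defers the potentially slow comment download until it is
# invoked, typically via the info dict:
#
#   info_dict['__post_extractor'] = self.extract_comments(video_id)
#
# Because _get_comments is a generator, a KeyboardInterrupt keeps the
# comments fetched so far; comment_count is then left as None to signal
# the interruption.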
3595
3596 def _get_comments(self, *args, **kwargs):
3597 raise NotImplementedError('This method must be implemented by subclasses')
3598
3599 @staticmethod
3600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3601 """ Merge subtitle items for one language. Items with duplicated URLs
3602 will be dropped. """
3603 list1_urls = {item['url'] for item in subtitle_list1}
3604 ret = list(subtitle_list1)
3605 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3606 return ret
3607
3608 @classmethod
3609 def _merge_subtitles(cls, *dicts, target=None):
3610 """ Merge subtitle dictionaries, language by language. """
3611 if target is None:
3612 target = {}
3613 for d in dicts:
3614 for lang, subs in d.items():
3615 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3616 return target
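# For example (hypothetical subtitle dicts):
#
#   >>> InfoExtractor._merge_subtitles(
#   ...     {'en': [{'url': 'http://a'}]},
#   ...     {'en': [{'url': 'http://a'}, {'url': 'http://b'}]},
#   ...     {'de': [{'url': 'http://c'}]})
#   {'en': [{'url': 'http://a'}, {'url': 'http://b'}], 'de': [{'url': 'http://c'}]}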
3617
3618 def extract_automatic_captions(self, *args, **kwargs):
3619 if (self.get_param('writeautomaticsub', False)
3620 or self.get_param('listsubtitles')):
3621 return self._get_automatic_captions(*args, **kwargs)
3622 return {}
3623
3624 def _get_automatic_captions(self, *args, **kwargs):
3625 raise NotImplementedError('This method must be implemented by subclasses')
3626
3627 def mark_watched(self, *args, **kwargs):
3628 if not self.get_param('mark_watched', False):
3629 return
3630 if (self._get_login_info()[0] is not None
3631 or self.get_param('cookiefile')
3632 or self.get_param('cookiesfrombrowser')):
3633 self._mark_watched(*args, **kwargs)
3634
3635 def _mark_watched(self, *args, **kwargs):
3636 raise NotImplementedError('This method must be implemented by subclasses')
3637
3638 def geo_verification_headers(self):
3639 headers = {}
3640 geo_verification_proxy = self.get_param('geo_verification_proxy')
3641 if geo_verification_proxy:
3642 headers['Ytdl-request-proxy'] = geo_verification_proxy
3643 return headers
3644
3645 def _generic_id(self, url):
3646 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3647
3648 def _generic_title(self, url):
3649 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3650
3651 @staticmethod
3652 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3653 all_known = all(
3654 x is not None for x in
3655 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3656 return (
3657 'private' if is_private
3658 else 'premium_only' if needs_premium
3659 else 'subscriber_only' if needs_subscription
3660 else 'needs_auth' if needs_auth
3661 else 'unlisted' if is_unlisted
3662 else 'public' if all_known
3663 else None)
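# Usage sketch (hypothetical flags): the first truthy flag wins:
#
#   >>> InfoExtractor._availability(is_private=False, needs_premium=True,
#   ...                             needs_subscription=False, needs_auth=False,
#   ...                             is_unlisted=False)
#   'premium_only'
#
# 'public' is only reported when every flag is known to be False; if any
# flag is None (unknown) and none is truthy, None is returned instead.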
3664
3665 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3666 '''
3667 @returns A list of values for the extractor argument given by "key"
3668 or "default" if no such key is present
3669 @param default The default value to return when the key is not present (default: [])
3670 @param casesense When false, the values are converted to lower case
3671 '''
3672 val = traverse_obj(
3673 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3674 if val is None:
3675 return [] if default is NO_DEFAULT else default
3676 return list(val) if casesense else [x.lower() for x in val]
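# For example (hypothetical command line): with
#   --extractor-args "youtube:player_client=android,web"
# a YoutubeIE method calling self._configuration_arg('player_client')
# receives ['android', 'web'], while an absent key yields [].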
3677
3678
3679 class SearchInfoExtractor(InfoExtractor):
3680 """
3681 Base class for paged search query extractors.
3682 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3683 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3684 """
3685
3686 _MAX_RESULTS = float('inf')
3687
3688 @classmethod
3689 def _make_valid_url(cls):
3690 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3691
3692 def _real_extract(self, query):
3693 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3694 if prefix == '':
3695 return self._get_n_results(query, 1)
3696 elif prefix == 'all':
3697 return self._get_n_results(query, self._MAX_RESULTS)
3698 else:
3699 n = int(prefix)
3700 if n <= 0:
3701 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3702 elif n > self._MAX_RESULTS:
3703 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3704 n = self._MAX_RESULTS
3705 return self._get_n_results(query, n)
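# For example (hypothetical subclass): with _SEARCH_KEY = 'ytsearch',
# 'ytsearch:cats' yields a single result, 'ytsearch5:cats' up to five,
# and 'ytsearchall:cats' is capped at _MAX_RESULTS.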
3706
3707 def _get_n_results(self, query, n):
3708 """Get a specified number of results for a query.
3709 Either this function or _search_results must be overridden by subclasses """
3710 return self.playlist_result(
3711 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3712 query, query)
3713
3714 def _search_results(self, query):
3715 """Returns an iterator of search results"""
3716 raise NotImplementedError('This method must be implemented by subclasses')
3717
3718 @property
3719 def SEARCH_KEY(self):
3720 return self._SEARCH_KEY