# coding: utf-8
from __future__ import unicode_literals

import base64
import collections
import hashlib
import itertools
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    join_nonempty,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    try_get,
    unescapeHTML,
    UnsupportedError,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range The dynamic range of the video. One of:
155 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * http_headers: A dictionary of additional HTTP headers
                                        to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, but at least one of "text" or
                    "html" must be present):
291 * "author" - human-readable name of the comment author
292 * "author_id" - user ID of the comment author
293 * "author_thumbnail" - The thumbnail of the comment author
294 * "id" - Comment ID
295 * "html" - Comment as HTML
296 * "text" - Plain text of the comment
297 * "timestamp" - UNIX timestamp of comment
298 * "parent" - ID of the comment this one is replying to.
299 Set to "root" to indicate that this is a
300 comment to the original video.
301 * "like_count" - Number of positive ratings of the comment
302 * "dislike_count" - Number of negative ratings of the comment
303 * "is_favorited" - Whether the comment is marked as
304 favorite by the video uploader
305 * "author_is_uploader" - Whether the comment is made by
306 the video uploader
307 age_limit: Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (Eg: 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
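
    As an illustration (all values made up), a minimal "video" result could
    look like this:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'http-720p',
                'width': 1280,
                'height': 720,
            }],
        }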


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
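
    As an illustration, a playlist result could look like this (each entry
    being an info dict as described above; values made up):

        {
            '_type': 'playlist',
            'id': 'some-playlist-id',
            'title': 'Some playlist',
            'entries': [entry1, entry2],
        }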


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
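
    As an illustration, a "url" result delegating to the Youtube extractor:

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
            'ie_key': 'Youtube',
        }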


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly
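
    A minimal subclass might look like this sketch (the site, URL pattern and
    extracted fields are made up for illustration):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www[.])?example[.]com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }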

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.
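
    For example (illustrative values), an extractor known to work only from
    Germany might set:

        _GEO_COUNTRIES = ['DE']
        _GEO_IP_BLOCKS = ['203.0.113.0/24']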

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            if '_VALID_URL' not in cls.__dict__:
                cls._VALID_URL = cls._make_valid_url()
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
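
        For example, an extractor could initialize geo bypass from within
        _real_extract once the unrestricted countries become known
        (illustrative values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'AT', 'CH'],
            })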

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
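
        For example (illustrative calls), to also accept a 404 response:

            webpage = self._download_webpage(url, video_id, expected_status=404)

        or any 4xx status, using a callable:

            webpage = self._download_webpage(
                url, video_id, expected_status=lambda x: 400 <= x < 500)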
893 """
894
895 success = False
896 try_count = 0
897 while success is False:
898 try:
899 res = self._download_webpage_handle(
900 url_or_request, video_id, note, errnote, fatal,
901 encoding=encoding, data=data, headers=headers, query=query,
902 expected_status=expected_status)
903 success = True
904 except compat_http_client.IncompleteRead as e:
905 try_count += 1
906 if try_count >= tries:
907 raise e
908 self._sleep(timeout, video_id)
909 if res is False:
910 return res
911 else:
912 content, _ = res
913 return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
        urls = (self.url_result(self._proto_relative_url(m), ie)
                for m in orderedSet(map(getter, matches) if getter else matches))
        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning, or raise
        a RegexNotFoundError, depending on fatal, specifying the field name.
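
        For example (an illustrative call; the pattern and field name are
        made up):

            video_id = self._search_regex(
                r'data-video-id="([0-9]+)"', webpage, 'video id', default=None)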
1168 """
1169 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1170 mobj = re.search(pattern, string, flags)
1171 else:
1172 for p in pattern:
1173 mobj = re.search(p, string, flags)
1174 if mobj:
1175 break
1176
1177 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1178
1179 if mobj:
1180 if group is None:
1181 # return the first matching group
1182 return next(g for g in mobj.groups() if g is not None)
1183 elif isinstance(group, (list, tuple)):
1184 return tuple(mobj.group(g) for g in group)
1185 else:
1186 return mobj.group(group)
1187 elif default is not NO_DEFAULT:
1188 return default
1189 elif fatal:
1190 raise RegexNotFoundError('Unable to extract %s' % _name)
1191 else:
1192 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1193 return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
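
        A typical (illustrative) use from an extractor's login routine:

            username, password = self._get_login_info()
            if username is None:
                return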
1236 """
1237
1238 # Attempt to use provided username and password or .netrc data
1239 username = self.get_param(username_option)
1240 if username is not None:
1241 password = self.get_param(password_option)
1242 else:
1243 username, password = self._get_netrc_login_info(netrc_machine)
1244
1245 return username, password
1246
1247 def _get_tfa_info(self, note='two-factor verification code'):
1248 """
1249 Get the two-factor authentication info
1250 TODO - asking the user will be required for sms/phone verify
1251 currently just uses the command line option
1252 If there's no info available, return None
1253 """
1254
1255 tfa = self.get_param('twofactor')
1256 if tfa is not None:
1257 return tfa
1258
1259 return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
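
    # For instance, given markup like
    #     <meta property="og:title" content="Some title">
    # _og_search_property('title', webpage) would return 'Some title'.
    # (An illustrative note: the regexes above also accept reversed attribute
    # order and "og-" prefixed property names.)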

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        kargs.setdefault('fatal', False)
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name, **kwargs):
        return self._html_search_regex(
            r'(?s)<title>(.*?)</title>', html, name, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count
1453
1454 def extract_chapter_information(e):
1455 chapters = [{
1456 'title': part.get('name'),
1457 'start_time': part.get('startOffset'),
1458 'end_time': part.get('endOffset'),
1459 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1460 for idx, (last_c, current_c, next_c) in enumerate(zip(
1461 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1462 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1463 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1464 if None in current_c.values():
1465 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1466 return
1467 if chapters:
1468 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1469 info['chapters'] = chapters
1470
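# Sketch of the filling above (hypothetical 'hasPart' data): two Clips with
# (startOffset, endOffset) of (0, None) and (10, None) in a 25-second video
# become [{'start_time': 0, 'end_time': 10}, {'start_time': 10, 'end_time': 25}]:
# a missing end_time is taken from the next clip's start_time, and the last
# chapter's end_time falls back to info['duration'].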
1471 def extract_video_object(e):
1472 assert e['@type'] == 'VideoObject'
1473 author = e.get('author')
1474 info.update({
1475 'url': url_or_none(e.get('contentUrl')),
1476 'title': unescapeHTML(e.get('name')),
1477 'description': unescapeHTML(e.get('description')),
1478 'thumbnails': [{'url': url_or_none(url)}
1479 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1480 'duration': parse_duration(e.get('duration')),
1481 'timestamp': unified_timestamp(e.get('uploadDate')),
1482 # author can be an instance of the 'Organization' or 'Person' types;
1483 # both can have a 'name' property (inherited from the 'Thing' type) [1].
1484 # However, some websites use the plain 'Text' type instead.
1485 # 1. https://schema.org/VideoObject
1486 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1487 'filesize': float_or_none(e.get('contentSize')),
1488 'tbr': int_or_none(e.get('bitrate')),
1489 'width': int_or_none(e.get('width')),
1490 'height': int_or_none(e.get('height')),
1491 'view_count': int_or_none(e.get('interactionCount')),
1492 })
1493 extract_interaction_statistic(e)
1494 extract_chapter_information(e)
1495
1496 def traverse_json_ld(json_ld, at_top_level=True):
1497 for e in json_ld:
1498 if at_top_level and '@context' not in e:
1499 continue
1500 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1501 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1502 break
1503 item_type = e.get('@type')
1504 if expected_type is not None and expected_type != item_type:
1505 continue
1506 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1507 if rating is not None:
1508 info['average_rating'] = rating
1509 if item_type in ('TVEpisode', 'Episode'):
1510 episode_name = unescapeHTML(e.get('name'))
1511 info.update({
1512 'episode': episode_name,
1513 'episode_number': int_or_none(e.get('episodeNumber')),
1514 'description': unescapeHTML(e.get('description')),
1515 })
1516 if not info.get('title') and episode_name:
1517 info['title'] = episode_name
1518 part_of_season = e.get('partOfSeason')
1519 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1520 info.update({
1521 'season': unescapeHTML(part_of_season.get('name')),
1522 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1523 })
1524 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1525 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1526 info['series'] = unescapeHTML(part_of_series.get('name'))
1527 elif item_type == 'Movie':
1528 info.update({
1529 'title': unescapeHTML(e.get('name')),
1530 'description': unescapeHTML(e.get('description')),
1531 'duration': parse_duration(e.get('duration')),
1532 'timestamp': unified_timestamp(e.get('dateCreated')),
1533 })
1534 elif item_type in ('Article', 'NewsArticle'):
1535 info.update({
1536 'timestamp': parse_iso8601(e.get('datePublished')),
1537 'title': unescapeHTML(e.get('headline')),
1538 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1539 })
1540 if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1541 extract_video_object(e['video'][0])
1542 elif item_type == 'VideoObject':
1543 extract_video_object(e)
1544 if expected_type is None:
1545 continue
1546 else:
1547 break
1548 video = e.get('video')
1549 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1550 extract_video_object(video)
1551 if expected_type is None:
1552 continue
1553 else:
1554 break
1555 traverse_json_ld(json_ld)
1556
1557 return {k: v for k, v in info.items() if v is not None}
1558
1559 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1560 return self._parse_json(
1561 self._search_regex(
1562 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1563 webpage, 'next.js data', fatal=fatal, **kw),
1564 video_id, transform_source=transform_source, fatal=fatal)
1565
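# A minimal page this targets (illustrative):
#   <script id="__NEXT_DATA__" type="application/json">
#     {"props": {"pageProps": {"video": {"id": "x1"}}}}
#   </script>
# _search_nextjs_data would return the parsed dict, which extractors then
# typically traverse via props/pageProps.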
1566 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1567 ''' Parses Nuxt.js metadata. This works as long as the function that __NUXT__ invokes is pure. '''
1568 # Not all websites do this, and the variable name can be changed:
1569 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1570 rectx = re.escape(context_name)
1571 js, arg_keys, arg_vals = self._search_regex(
1572 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1573 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1574 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1575
1576 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1577
1578 for key, val in args.items():
1579 if val in ('undefined', 'void 0'):
1580 args[key] = 'null'
1581
1582 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1583
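# The markup this targets typically looks like (illustrative):
#   <script>window.__NUXT__=(function(a,b){return {data:[{title:a,id:b}]}}("Demo","x1"));</script>
# The parameter names are zipped with the call-site values, substituted by
# js_to_json, and the first element of 'data' is returned:
# {'title': 'Demo', 'id': 'x1'} in this case.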
1584 @staticmethod
1585 def _hidden_inputs(html):
1586 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1587 hidden_inputs = {}
1588 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1589 attrs = extract_attributes(input)
1590 if not attrs:  # nothing parseable; skip
1591 continue
1592 if attrs.get('type') not in ('hidden', 'submit'):
1593 continue
1594 name = attrs.get('name') or attrs.get('id')
1595 value = attrs.get('value')
1596 if name and value is not None:
1597 hidden_inputs[name] = value
1598 return hidden_inputs
1599
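# Sketch (hypothetical markup): for
#   <input type="hidden" name="csrf" value="abc123">
#   <input type="text" name="user" value="x">
# _hidden_inputs returns {'csrf': 'abc123'}; only hidden/submit inputs
# with both a name (or id) and a value are kept.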
1600 def _form_hidden_inputs(self, form_id, html):
1601 form = self._search_regex(
1602 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1603 html, '%s form' % form_id, group='form')
1604 return self._hidden_inputs(form)
1605
1606 class FormatSort:
1607 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1608
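# The sort-string grammar above, by example: 'res:1080' parses to
# field='res', separator=':', limit='1080'; '+size~200M' gives reverse='+',
# field='size', separator='~' (prefer the closest value), limit='200M'.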
1609 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1610 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1611 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1612 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1613 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1614 'fps', 'fs_approx', 'source', 'id')
1615
1616 settings = {
1617 'vcodec': {'type': 'ordered', 'regex': True,
1618 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1619 'acodec': {'type': 'ordered', 'regex': True,
1620 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1621 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1622 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1623 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1624 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1625 'vext': {'type': 'ordered', 'field': 'video_ext',
1626 'order': ('mp4', 'webm', 'flv', '', 'none'),
1627 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1628 'aext': {'type': 'ordered', 'field': 'audio_ext',
1629 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1630 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1631 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1632 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1633 'field': ('vcodec', 'acodec'),
1634 'function': lambda it: int(any(v != 'none' for v in it))},
1635 'ie_pref': {'priority': True, 'type': 'extractor'},
1636 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1637 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1638 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1639 'quality': {'convert': 'float', 'default': -1},
1640 'filesize': {'convert': 'bytes'},
1641 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1642 'id': {'convert': 'string', 'field': 'format_id'},
1643 'height': {'convert': 'float_none'},
1644 'width': {'convert': 'float_none'},
1645 'fps': {'convert': 'float_none'},
1646 'tbr': {'convert': 'float_none'},
1647 'vbr': {'convert': 'float_none'},
1648 'abr': {'convert': 'float_none'},
1649 'asr': {'convert': 'float_none'},
1650 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1651
1652 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1653 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1654 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1655 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1656 'res': {'type': 'multiple', 'field': ('height', 'width'),
1657 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1658
1659 # For compatibility with youtube-dl
1660 'format_id': {'type': 'alias', 'field': 'id'},
1661 'preference': {'type': 'alias', 'field': 'ie_pref'},
1662 'language_preference': {'type': 'alias', 'field': 'lang'},
1663
1664 # Deprecated
1665 'dimension': {'type': 'alias', 'field': 'res'},
1666 'resolution': {'type': 'alias', 'field': 'res'},
1667 'extension': {'type': 'alias', 'field': 'ext'},
1668 'bitrate': {'type': 'alias', 'field': 'br'},
1669 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1670 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1671 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1672 'framerate': {'type': 'alias', 'field': 'fps'},
1673 'protocol': {'type': 'alias', 'field': 'proto'},
1674 'source_preference': {'type': 'alias', 'field': 'source'},
1675 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1676 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1677 'samplerate': {'type': 'alias', 'field': 'asr'},
1678 'video_ext': {'type': 'alias', 'field': 'vext'},
1679 'audio_ext': {'type': 'alias', 'field': 'aext'},
1680 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1681 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1682 'video': {'type': 'alias', 'field': 'hasvid'},
1683 'has_video': {'type': 'alias', 'field': 'hasvid'},
1684 'audio': {'type': 'alias', 'field': 'hasaud'},
1685 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1686 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1687 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1688 }
1689
1690 def __init__(self, ie, field_preference):
1691 self._order = []
1692 self.ydl = ie._downloader
1693 self.evaluate_params(self.ydl.params, field_preference)
1694 if ie.get_param('verbose'):
1695 self.print_verbose_info(self.ydl.write_debug)
1696
1697 def _get_field_setting(self, field, key):
1698 if field not in self.settings:
1699 if key in ('forced', 'priority'):
1700 return False
1701 self.ydl.deprecation_warning(
1702 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1703 'and may be removed in a future version')
1704 self.settings[field] = {}
1705 propObj = self.settings[field]
1706 if key not in propObj:
1707 type = propObj.get('type')
1708 if key == 'field':
1709 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1710 elif key == 'convert':
1711 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1712 else:
1713 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1714 propObj[key] = default
1715 return propObj[key]
1716
1717 def _resolve_field_value(self, field, value, convertNone=False):
1718 if value is None:
1719 if not convertNone:
1720 return None
1721 else:
1722 value = value.lower()
1723 conversion = self._get_field_setting(field, 'convert')
1724 if conversion == 'ignore':
1725 return None
1726 if conversion == 'string':
1727 return value
1728 elif conversion == 'float_none':
1729 return float_or_none(value)
1730 elif conversion == 'bytes':
1731 return FileDownloader.parse_bytes(value)
1732 elif conversion == 'order':
1733 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1734 use_regex = self._get_field_setting(field, 'regex')
1735 list_length = len(order_list)
1736 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1737 if use_regex and value is not None:
1738 for i, regex in enumerate(order_list):
1739 if regex and re.match(regex, value):
1740 return list_length - i
1741 return list_length - empty_pos # not in list
1742 else: # not regex or value = None
1743 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1744 else:
1745 if value.isnumeric():
1746 return float(value)
1747 else:
1748 self.settings[field]['convert'] = 'string'
1749 return value
1750
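# Example of the 'order' conversion above (per the vcodec setting): with
# the 11-entry order list starting ['av0?1', 'vp0?9.2', 'vp0?9', ...],
# a format whose vcodec is 'vp9' matches the regex 'vp0?9' at index 2
# and resolves to 11 - 2 = 9, so entries earlier in the list rank higher.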
1751 def evaluate_params(self, params, sort_extractor):
1752 self._use_free_order = params.get('prefer_free_formats', False)
1753 self._sort_user = params.get('format_sort', [])
1754 self._sort_extractor = sort_extractor
1755
1756 def add_item(field, reverse, closest, limit_text):
1757 field = field.lower()
1758 if field in self._order:
1759 return
1760 self._order.append(field)
1761 limit = self._resolve_field_value(field, limit_text)
1762 data = {
1763 'reverse': reverse,
1764 'closest': False if limit is None else closest,
1765 'limit_text': limit_text,
1766 'limit': limit}
1767 if field in self.settings:
1768 self.settings[field].update(data)
1769 else:
1770 self.settings[field] = data
1771
1772 sort_list = (
1773 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1774 + (tuple() if params.get('format_sort_force', False)
1775 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1776 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1777
1778 for item in sort_list:
1779 match = re.match(self.regex, item)
1780 if match is None:
1781 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1782 field = match.group('field')
1783 if field is None:
1784 continue
1785 if self._get_field_setting(field, 'type') == 'alias':
1786 alias, field = field, self._get_field_setting(field, 'field')
1787 if alias not in ('format_id', 'preference', 'language_preference'):
1788 self.ydl.deprecation_warning(
1789 f'Format sorting alias {alias} is deprecated '
1790 f'and may be removed in a future version. Please use {field} instead')
1791 reverse = match.group('reverse') is not None
1792 closest = match.group('separator') == '~'
1793 limit_text = match.group('limit')
1794
1795 has_limit = limit_text is not None
1796 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1797 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1798
1799 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1800 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1801 limit_count = len(limits)
1802 for (i, f) in enumerate(fields):
1803 add_item(f, reverse, closest,
1804 limits[i] if i < limit_count
1805 else limits[0] if has_limit and not has_multiple_limits
1806 else None)
1807
1808 def print_verbose_info(self, write_debug):
1809 if self._sort_user:
1810 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1811 if self._sort_extractor:
1812 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1813 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1814 '+' if self._get_field_setting(field, 'reverse') else '', field,
1815 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1816 self._get_field_setting(field, 'limit_text'),
1817 self._get_field_setting(field, 'limit'))
1818 if self._get_field_setting(field, 'limit_text') is not None else '')
1819 for field in self._order if self._get_field_setting(field, 'visible')]))
1820
1821 def _calculate_field_preference_from_value(self, format, field, type, value):
1822 reverse = self._get_field_setting(field, 'reverse')
1823 closest = self._get_field_setting(field, 'closest')
1824 limit = self._get_field_setting(field, 'limit')
1825
1826 if type == 'extractor':
1827 maximum = self._get_field_setting(field, 'max')
1828 if value is None or (maximum is not None and value >= maximum):
1829 value = -1
1830 elif type == 'boolean':
1831 in_list = self._get_field_setting(field, 'in_list')
1832 not_in_list = self._get_field_setting(field, 'not_in_list')
1833 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1834 elif type == 'ordered':
1835 value = self._resolve_field_value(field, value, True)
1836
1837 # try to convert to number
1838 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1839 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1840 if is_num:
1841 value = val_num
1842
1843 return ((-10, 0) if value is None
1844 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1845 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1846 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1847 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1848 else (-1, value, 0))
1849
1850 def _calculate_field_preference(self, format, field):
1851 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1852 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1853 if type == 'multiple':
1854 type = 'field' # Only 'field' is allowed in multiple for now
1855 actual_fields = self._get_field_setting(field, 'field')
1856
1857 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1858 else:
1859 value = get_value(field)
1860 return self._calculate_field_preference_from_value(format, field, type, value)
1861
1862 def calculate_preference(self, format):
1863 # Determine missing protocol
1864 if not format.get('protocol'):
1865 format['protocol'] = determine_protocol(format)
1866
1867 # Determine missing ext
1868 if not format.get('ext') and 'url' in format:
1869 format['ext'] = determine_ext(format['url'])
1870 if format.get('vcodec') == 'none':
1871 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1872 format['video_ext'] = 'none'
1873 else:
1874 format['video_ext'] = format['ext']
1875 format['audio_ext'] = 'none'
1876 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1877 # format['preference'] = -1000
1878
1879 # Determine missing bitrates
1880 if format.get('tbr') is None:
1881 if format.get('vbr') is not None and format.get('abr') is not None:
1882 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1883 else:
1884 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1885 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1886 if format.get('acodec') != 'none' and format.get('abr') is None:
1887 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1888
1889 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1890
1891 def _sort_formats(self, formats, field_preference=[]):
1892 if not formats:
1893 return
1894 format_sort = self.FormatSort(self, field_preference)
1895 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1896
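# Usage sketch (field values assumed): given
#   formats = [{'format_id': 'hd', 'url': 'https://example.com/hd.mp4', 'height': 1080},
#              {'format_id': 'sd', 'url': 'https://example.com/sd.mp4', 'height': 360}]
# self._sort_formats(formats) reorders the list in place from worst to
# best per the default field order, leaving the 1080p entry last.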
1897 def _check_formats(self, formats, video_id):
1898 if formats:
1899 formats[:] = filter(
1900 lambda f: self._is_valid_url(
1901 f['url'], video_id,
1902 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1903 formats)
1904
1905 @staticmethod
1906 def _remove_duplicate_formats(formats):
1907 format_urls = set()
1908 unique_formats = []
1909 for f in formats:
1910 if f['url'] not in format_urls:
1911 format_urls.add(f['url'])
1912 unique_formats.append(f)
1913 formats[:] = unique_formats
1914
1915 def _is_valid_url(self, url, video_id, item='video', headers={}):
1916 url = self._proto_relative_url(url, scheme='http:')
1917 # For now, assume non-HTTP(S) URLs are always valid
1918 if not (url.startswith('http://') or url.startswith('https://')):
1919 return True
1920 try:
1921 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1922 return True
1923 except ExtractorError as e:
1924 self.to_screen(
1925 '%s: %s URL is invalid, skipping: %s'
1926 % (video_id, item, error_to_compat_str(e.cause)))
1927 return False
1928
1929 def http_scheme(self):
1930 """ Either "http:" or "https:", depending on the user's preferences """
1931 return (
1932 'http:'
1933 if self.get_param('prefer_insecure', False)
1934 else 'https:')
1935
1936 def _proto_relative_url(self, url, scheme=None):
1937 if url is None:
1938 return url
1939 if url.startswith('//'):
1940 if scheme is None:
1941 scheme = self.http_scheme()
1942 return scheme + url
1943 else:
1944 return url
1945
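# Behaviour of the branches above, by example:
#   self._proto_relative_url('//cdn.example.com/v.mp4') -> 'https://cdn.example.com/v.mp4'
#   (or 'http://...' when prefer_insecure is set);
#   self._proto_relative_url('/relative/path') -> '/relative/path' (unchanged)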
1946 def _sleep(self, timeout, video_id, msg_template=None):
1947 if msg_template is None:
1948 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1949 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1950 self.to_screen(msg)
1951 time.sleep(timeout)
1952
1953 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1954 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1955 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1956 manifest = self._download_xml(
1957 manifest_url, video_id, 'Downloading f4m manifest',
1958 'Unable to download f4m manifest',
1959 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1960 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1961 transform_source=transform_source,
1962 fatal=fatal, data=data, headers=headers, query=query)
1963
1964 if manifest is False:
1965 return []
1966
1967 return self._parse_f4m_formats(
1968 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1969 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1970
1971 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1972 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1973 fatal=True, m3u8_id=None):
1974 if not isinstance(manifest, compat_etree_Element) and not fatal:
1975 return []
1976
1977 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1978 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1979 if akamai_pv is not None and ';' in akamai_pv.text:
1980 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1981 if playerVerificationChallenge.strip() != '':
1982 return []
1983
1984 formats = []
1985 manifest_version = '1.0'
1986 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1987 if not media_nodes:
1988 manifest_version = '2.0'
1989 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1990 # Remove unsupported DRM protected media from final formats
1991 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1992 media_nodes = remove_encrypted_media(media_nodes)
1993 if not media_nodes:
1994 return formats
1995
1996 manifest_base_url = get_base_url(manifest)
1997
1998 bootstrap_info = xpath_element(
1999 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2000 'bootstrap info', default=None)
2001
2002 vcodec = None
2003 mime_type = xpath_text(
2004 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2005 'mime type', default=None)
2006 if mime_type and mime_type.startswith('audio/'):
2007 vcodec = 'none'
2008
2009 for i, media_el in enumerate(media_nodes):
2010 tbr = int_or_none(media_el.attrib.get('bitrate'))
2011 width = int_or_none(media_el.attrib.get('width'))
2012 height = int_or_none(media_el.attrib.get('height'))
2013 format_id = join_nonempty(f4m_id, tbr or i)
2014 # If <bootstrapInfo> is present, the specified f4m is a
2015 # stream-level manifest, and only set-level manifests may refer to
2016 # external resources. See section 11.4 and section 4 of F4M spec
2017 if bootstrap_info is None:
2018 media_url = None
2019 # @href is introduced in 2.0, see section 11.6 of F4M spec
2020 if manifest_version == '2.0':
2021 media_url = media_el.attrib.get('href')
2022 if media_url is None:
2023 media_url = media_el.attrib.get('url')
2024 if not media_url:
2025 continue
2026 manifest_url = (
2027 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2028 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2029 # If media_url is itself an f4m manifest, do the recursive extraction,
2030 # since the bitrates in the parent manifest (this one) and in the media_url
2031 # manifest may differ, making it impossible to resolve the format by the
2032 # requested bitrate in the f4m downloader
2033 ext = determine_ext(manifest_url)
2034 if ext == 'f4m':
2035 f4m_formats = self._extract_f4m_formats(
2036 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2037 transform_source=transform_source, fatal=fatal)
2038 # Sometimes a stream-level manifest contains a single media entry that
2039 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player),
2040 # while the parent's media entry in the set-level manifest may
2041 # contain it; copy it from the parent in such cases.
2042 if len(f4m_formats) == 1:
2043 f = f4m_formats[0]
2044 f.update({
2045 'tbr': f.get('tbr') or tbr,
2046 'width': f.get('width') or width,
2047 'height': f.get('height') or height,
2048 'format_id': f.get('format_id') if not tbr else format_id,
2049 'vcodec': vcodec,
2050 })
2051 formats.extend(f4m_formats)
2052 continue
2053 elif ext == 'm3u8':
2054 formats.extend(self._extract_m3u8_formats(
2055 manifest_url, video_id, 'mp4', preference=preference,
2056 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2057 continue
2058 formats.append({
2059 'format_id': format_id,
2060 'url': manifest_url,
2061 'manifest_url': manifest_url,
2062 'ext': 'flv' if bootstrap_info is not None else None,
2063 'protocol': 'f4m',
2064 'tbr': tbr,
2065 'width': width,
2066 'height': height,
2067 'vcodec': vcodec,
2068 'preference': preference,
2069 'quality': quality,
2070 })
2071 return formats
2072
2073 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2074 return {
2075 'format_id': join_nonempty(m3u8_id, 'meta'),
2076 'url': m3u8_url,
2077 'ext': ext,
2078 'protocol': 'm3u8',
2079 'preference': preference - 100 if preference else -100,
2080 'quality': quality,
2081 'resolution': 'multiple',
2082 'format_note': 'Quality selection URL',
2083 }
2084
2085 def _report_ignoring_subs(self, name):
2086 self.report_warning(bug_reports_message(
2087 f'Ignoring subtitle tracks found in the {name} manifest; '
2088 'if any subtitle tracks are missing,'
2089 ), only_once=True)
2090
2091 def _extract_m3u8_formats(self, *args, **kwargs):
2092 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2093 if subs:
2094 self._report_ignoring_subs('HLS')
2095 return fmts
2096
2097 def _extract_m3u8_formats_and_subtitles(
2098 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2099 preference=None, quality=None, m3u8_id=None, note=None,
2100 errnote=None, fatal=True, live=False, data=None, headers={},
2101 query={}):
2102
2103 res = self._download_webpage_handle(
2104 m3u8_url, video_id,
2105 note='Downloading m3u8 information' if note is None else note,
2106 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2107 fatal=fatal, data=data, headers=headers, query=query)
2108
2109 if res is False:
2110 return [], {}
2111
2112 m3u8_doc, urlh = res
2113 m3u8_url = urlh.geturl()
2114
2115 return self._parse_m3u8_formats_and_subtitles(
2116 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2117 preference=preference, quality=quality, m3u8_id=m3u8_id,
2118 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2119 headers=headers, query=query, video_id=video_id)
2120
2121 def _parse_m3u8_formats_and_subtitles(
2122 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2123 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2124 errnote=None, fatal=True, data=None, headers={}, query={},
2125 video_id=None):
2126 formats, subtitles = [], {}
2127
2128 has_drm = re.search('|'.join([
2129 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2130 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2131 ]), m3u8_doc)
2132
2133 def format_url(url):
2134 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2135
2136 if self.get_param('hls_split_discontinuity', False):
2137 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2138 if not m3u8_doc:
2139 if not manifest_url:
2140 return []
2141 m3u8_doc = self._download_webpage(
2142 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2143 note=False, errnote='Failed to download m3u8 playlist information')
2144 if m3u8_doc is False:
2145 return []
2146 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2147
2148 else:
2149 def _extract_m3u8_playlist_indices(*args, **kwargs):
2150 return [None]
2151
2152 # References:
2153 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2154 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2155 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2156
2157 # We should try extracting formats only from master playlists [1, 4.3.4],
2158 # i.e. playlists that describe the available qualities. On the other hand,
2159 # media playlists [1, 4.3.3] should be returned as is, since they contain
2160 # just the media, without quality renditions.
2161 # Fortunately, a master playlist can easily be distinguished from a media
2162 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2163 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2164 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2165 # media playlist and MUST NOT appear in a master playlist, so we can
2166 # reliably detect a media playlist with this criterion.
2167
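# For illustration, a media playlist (returned as is):
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.9,
#   seg0.ts
# versus a master playlist, which is parsed for qualities below:
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8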
2168 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2169 formats = [{
2170 'format_id': join_nonempty(m3u8_id, idx),
2171 'format_index': idx,
2172 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2173 'ext': ext,
2174 'protocol': entry_protocol,
2175 'preference': preference,
2176 'quality': quality,
2177 'has_drm': has_drm,
2178 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2179
2180 return formats, subtitles
2181
2182 groups = {}
2183 last_stream_inf = {}
2184
2185 def extract_media(x_media_line):
2186 media = parse_m3u8_attributes(x_media_line)
2187 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2188 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2189 if not (media_type and group_id and name):
2190 return
2191 groups.setdefault(group_id, []).append(media)
2192 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2193 if media_type == 'SUBTITLES':
2194 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2195 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2196 # However, lack of URI has been spotted in the wild.
2197 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2198 if not media.get('URI'):
2199 return
2200 url = format_url(media['URI'])
2201 sub_info = {
2202 'url': url,
2203 'ext': determine_ext(url),
2204 }
2205 if sub_info['ext'] == 'm3u8':
2206 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2207 # files may contain is WebVTT:
2208 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2209 sub_info['ext'] = 'vtt'
2210 sub_info['protocol'] = 'm3u8_native'
2211 lang = media.get('LANGUAGE') or 'und'
2212 subtitles.setdefault(lang, []).append(sub_info)
2213 if media_type not in ('VIDEO', 'AUDIO'):
2214 return
2215 media_url = media.get('URI')
2216 if media_url:
2217 manifest_url = format_url(media_url)
2218 formats.extend({
2219 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2220 'format_note': name,
2221 'format_index': idx,
2222 'url': manifest_url,
2223 'manifest_url': m3u8_url,
2224 'language': media.get('LANGUAGE'),
2225 'ext': ext,
2226 'protocol': entry_protocol,
2227 'preference': preference,
2228 'quality': quality,
2229 'vcodec': 'none' if media_type == 'AUDIO' else None,
2230 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2231
2232 def build_stream_name():
2233 # Although the specification does not mention a NAME attribute for
2234 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2235 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2236 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2237 stream_name = last_stream_inf.get('NAME')
2238 if stream_name:
2239 return stream_name
2240 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2241 # from the corresponding rendition group
2242 stream_group_id = last_stream_inf.get('VIDEO')
2243 if not stream_group_id:
2244 return
2245 stream_group = groups.get(stream_group_id)
2246 if not stream_group:
2247 return stream_group_id
2248 rendition = stream_group[0]
2249 return rendition.get('NAME') or stream_group_id
2250
2251 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2252 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2253 # precede EXT-X-MEDIA tags in HLS manifests such as [3].
2254 for line in m3u8_doc.splitlines():
2255 if line.startswith('#EXT-X-MEDIA:'):
2256 extract_media(line)
2257
2258 for line in m3u8_doc.splitlines():
2259 if line.startswith('#EXT-X-STREAM-INF:'):
2260 last_stream_inf = parse_m3u8_attributes(line)
2261 elif line.startswith('#') or not line.strip():
2262 continue
2263 else:
2264 tbr = float_or_none(
2265 last_stream_inf.get('AVERAGE-BANDWIDTH')
2266 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2267 manifest_url = format_url(line.strip())
2268
2269 for idx in _extract_m3u8_playlist_indices(manifest_url):
2270 format_id = [m3u8_id, None, idx]
2271 # The bandwidth of live streams may differ over time, making
2272 # format_id unpredictable, so it's better to keep the provided
2273 # format_id intact.
2274 if not live:
2275 stream_name = build_stream_name()
2276 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2277 f = {
2278 'format_id': join_nonempty(*format_id),
2279 'format_index': idx,
2280 'url': manifest_url,
2281 'manifest_url': m3u8_url,
2282 'tbr': tbr,
2283 'ext': ext,
2284 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2285 'protocol': entry_protocol,
2286 'preference': preference,
2287 'quality': quality,
2288 }
2289 resolution = last_stream_inf.get('RESOLUTION')
2290 if resolution:
2291 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2292 if mobj:
2293 f['width'] = int(mobj.group('width'))
2294 f['height'] = int(mobj.group('height'))
2295 # Unified Streaming Platform
2296 mobj = re.search(
2297 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2298 if mobj:
2299 abr, vbr = mobj.groups()
2300 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2301 f.update({
2302 'vbr': vbr,
2303 'abr': abr,
2304 })
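# Illustrative: a USP URL containing '-audio_eng=128000-video=1300000'
# yields abr=128.0 and vbr=1300.0 (kbit/s) via the regex above.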
2305 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2306 f.update(codecs)
2307 audio_group_id = last_stream_inf.get('AUDIO')
2308 # As per [1, 4.3.4.1.1], any EXT-X-STREAM-INF tag which
2309 # references a rendition group MUST have a CODECS attribute.
2310 # However, this is not always respected: for example, [2]
2311 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2312 # rendition group but does not have CODECS, and despite
2313 # referencing an audio group it represents a complete
2314 # (audio and video) format. So, for such cases we will
2315 # ignore references to rendition groups and treat them
2316 # as complete formats.
2317 if audio_group_id and codecs and f.get('vcodec') != 'none':
2318 audio_group = groups.get(audio_group_id)
2319 if audio_group and audio_group[0].get('URI'):
2320 # TODO: update acodec for audio only formats with
2321 # the same GROUP-ID
2322 f['acodec'] = 'none'
2323 if not f.get('ext'):
2324 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2325 formats.append(f)
2326
2327 # for DailyMotion
2328 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2329 if progressive_uri:
2330 http_f = f.copy()
2331 del http_f['manifest_url']
2332 http_f.update({
2333 'format_id': f['format_id'].replace('hls-', 'http-'),
2334 'protocol': 'http',
2335 'url': progressive_uri,
2336 })
2337 formats.append(http_f)
2338
2339 last_stream_inf = {}
2340 return formats, subtitles
2341
2342 def _extract_m3u8_vod_duration(
2343 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2344
2345 m3u8_vod = self._download_webpage(
2346 m3u8_vod_url, video_id,
2347 note='Downloading m3u8 VOD manifest' if note is None else note,
2348 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2349 fatal=False, data=data, headers=headers, query=query)
2350
2351 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2352
2353 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2354 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2355 return None
2356
2357 return int(sum(
2358 float(line[len('#EXTINF:'):].split(',')[0])
2359 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2360
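# Sketch: a playlist containing '#EXT-X-PLAYLIST-TYPE:VOD' plus
# '#EXTINF:9.9,' and '#EXTINF:5.2,' lines sums to int(15.1) = 15 seconds;
# without the VOD marker (e.g. a live playlist), None is returned.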
2361 @staticmethod
2362 def _xpath_ns(path, namespace=None):
2363 if not namespace:
2364 return path
2365 out = []
2366 for c in path.split('/'):
2367 if not c or c == '.':
2368 out.append(c)
2369 else:
2370 out.append('{%s}%s' % (namespace, c))
2371 return '/'.join(out)
2372
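# For instance (namespace chosen for illustration):
#   _xpath_ns('./head/meta', 'urn:example:smil')
#   -> './{urn:example:smil}head/{urn:example:smil}meta'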
2373 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2374 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2375
2376 if smil is False:
2377 assert not fatal
2378 return [], {}
2379
2380 namespace = self._parse_smil_namespace(smil)
2381
2382 fmts = self._parse_smil_formats(
2383 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2384 subs = self._parse_smil_subtitles(
2385 smil, namespace=namespace)
2386
2387 return fmts, subs
2388
2389 def _extract_smil_formats(self, *args, **kwargs):
2390 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2391 if subs:
2392 self._report_ignoring_subs('SMIL')
2393 return fmts
2394
2395 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2396 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2397 if smil is False:
2398 return {}
2399 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2400
2401 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2402 return self._download_xml(
2403 smil_url, video_id, 'Downloading SMIL file',
2404 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2405
2406 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2407 namespace = self._parse_smil_namespace(smil)
2408
2409 formats = self._parse_smil_formats(
2410 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2411 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2412
2413 video_id = os.path.splitext(url_basename(smil_url))[0]
2414 title = None
2415 description = None
2416 upload_date = None
2417 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2418 name = meta.attrib.get('name')
2419 content = meta.attrib.get('content')
2420 if not name or not content:
2421 continue
2422 if not title and name == 'title':
2423 title = content
2424 elif not description and name in ('description', 'abstract'):
2425 description = content
2426 elif not upload_date and name == 'date':
2427 upload_date = unified_strdate(content)
2428
2429 thumbnails = [{
2430 'id': image.get('type'),
2431 'url': image.get('src'),
2432 'width': int_or_none(image.get('width')),
2433 'height': int_or_none(image.get('height')),
2434 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2435
2436 return {
2437 'id': video_id,
2438 'title': title or video_id,
2439 'description': description,
2440 'upload_date': upload_date,
2441 'thumbnails': thumbnails,
2442 'formats': formats,
2443 'subtitles': subtitles,
2444 }
2445
2446 def _parse_smil_namespace(self, smil):
2447 return self._search_regex(
2448 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2449
2450 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2451 base = smil_url
2452 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2453 b = meta.get('base') or meta.get('httpBase')
2454 if b:
2455 base = b
2456 break
2457
2458 formats = []
2459 rtmp_count = 0
2460 http_count = 0
2461 m3u8_count = 0
2462 imgs_count = 0
2463
2464 srcs = set()
2465 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2466 for medium in media:
2467 src = medium.get('src')
2468 if not src or src in srcs:
2469 continue
2470 srcs.add(src)
2471
2472 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2473 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2474 width = int_or_none(medium.get('width'))
2475 height = int_or_none(medium.get('height'))
2476 proto = medium.get('proto')
2477 ext = medium.get('ext')
2478 src_ext = determine_ext(src)
2479 streamer = medium.get('streamer') or base
2480
2481 if proto == 'rtmp' or streamer.startswith('rtmp'):
2482 rtmp_count += 1
2483 formats.append({
2484 'url': streamer,
2485 'play_path': src,
2486 'ext': 'flv',
2487 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2488 'tbr': bitrate,
2489 'filesize': filesize,
2490 'width': width,
2491 'height': height,
2492 })
2493 if transform_rtmp_url:
2494 streamer, src = transform_rtmp_url(streamer, src)
2495 formats[-1].update({
2496 'url': streamer,
2497 'play_path': src,
2498 })
2499 continue
2500
2501 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2502 src_url = src_url.strip()
2503
2504 if proto == 'm3u8' or src_ext == 'm3u8':
2505 m3u8_formats = self._extract_m3u8_formats(
2506 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2507 if len(m3u8_formats) == 1:
2508 m3u8_count += 1
2509 m3u8_formats[0].update({
2510 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2511 'tbr': bitrate,
2512 'width': width,
2513 'height': height,
2514 })
2515 formats.extend(m3u8_formats)
2516 elif src_ext == 'f4m':
2517 f4m_url = src_url
2518 if not f4m_params:
2519 f4m_params = {
2520 'hdcore': '3.2.0',
2521 'plugin': 'flowplayer-3.2.0.1',
2522 }
2523 f4m_url += '&' if '?' in f4m_url else '?'
2524 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2525 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2526 elif src_ext == 'mpd':
2527 formats.extend(self._extract_mpd_formats(
2528 src_url, video_id, mpd_id='dash', fatal=False))
2529 elif re.search(r'\.ism/[Mm]anifest', src_url):
2530 formats.extend(self._extract_ism_formats(
2531 src_url, video_id, ism_id='mss', fatal=False))
2532 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2533 http_count += 1
2534 formats.append({
2535 'url': src_url,
2536 'ext': ext or src_ext or 'flv',
2537 'format_id': 'http-%d' % (bitrate or http_count),
2538 'tbr': bitrate,
2539 'filesize': filesize,
2540 'width': width,
2541 'height': height,
2542 })
2543
2544 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2545 src = medium.get('src')
2546 if not src or src in srcs:
2547 continue
2548 srcs.add(src)
2549
2550 imgs_count += 1
2551 formats.append({
2552 'format_id': 'imagestream-%d' % (imgs_count),
2553 'url': src,
2554 'ext': mimetype2ext(medium.get('type')),
2555 'acodec': 'none',
2556 'vcodec': 'none',
2557 'width': int_or_none(medium.get('width')),
2558 'height': int_or_none(medium.get('height')),
2559 'format_note': 'SMIL storyboards',
2560 })
2561
2562 return formats
2563
2564 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2565 urls = []
2566 subtitles = {}
2567 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2568 src = textstream.get('src')
2569 if not src or src in urls:
2570 continue
2571 urls.append(src)
2572 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2573 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2574 subtitles.setdefault(lang, []).append({
2575 'url': src,
2576 'ext': ext,
2577 })
2578 return subtitles
2579
2580 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2581 xspf = self._download_xml(
2582 xspf_url, playlist_id, 'Downloading xspf playlist',
2583 'Unable to download xspf manifest', fatal=fatal)
2584 if xspf is False:
2585 return []
2586 return self._parse_xspf(
2587 xspf, playlist_id, xspf_url=xspf_url,
2588 xspf_base_url=base_url(xspf_url))
2589
2590 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2591 NS_MAP = {
2592 'xspf': 'http://xspf.org/ns/0/',
2593 's1': 'http://static.streamone.nl/player/ns/0',
2594 }
2595
2596 entries = []
2597 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2598 title = xpath_text(
2599 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2600 description = xpath_text(
2601 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2602 thumbnail = xpath_text(
2603 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2604 duration = float_or_none(
2605 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2606
2607 formats = []
2608 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2609 format_url = urljoin(xspf_base_url, location.text)
2610 if not format_url:
2611 continue
2612 formats.append({
2613 'url': format_url,
2614 'manifest_url': xspf_url,
2615 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2616 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2617 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2618 })
2619 self._sort_formats(formats)
2620
2621 entries.append({
2622 'id': playlist_id,
2623 'title': title,
2624 'description': description,
2625 'thumbnail': thumbnail,
2626 'duration': duration,
2627 'formats': formats,
2628 })
2629 return entries
2630
2631 def _extract_mpd_formats(self, *args, **kwargs):
2632 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2633 if subs:
2634 self._report_ignoring_subs('DASH')
2635 return fmts
2636
2637 def _extract_mpd_formats_and_subtitles(
2638 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2639 fatal=True, data=None, headers={}, query={}):
2640 res = self._download_xml_handle(
2641 mpd_url, video_id,
2642 note='Downloading MPD manifest' if note is None else note,
2643 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2644 fatal=fatal, data=data, headers=headers, query=query)
2645 if res is False:
2646 return [], {}
2647 mpd_doc, urlh = res
2648 if mpd_doc is None:
2649 return [], {}
2650 mpd_base_url = base_url(urlh.geturl())
2651
2652 return self._parse_mpd_formats_and_subtitles(
2653 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2654
2655 def _parse_mpd_formats(self, *args, **kwargs):
2656 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2657 if subs:
2658 self._report_ignoring_subs('DASH')
2659 return fmts
2660
2661 def _parse_mpd_formats_and_subtitles(
2662 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2663 """
2664 Parse formats from MPD manifest.
2665 References:
2666 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2667 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2668 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2669 """
2670 if not self.get_param('dynamic_mpd', True):
2671 if mpd_doc.get('type') == 'dynamic':
2672 return [], {}
2673
2674 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2675
2676 def _add_ns(path):
2677 return self._xpath_ns(path, namespace)
2678
2679 def is_drm_protected(element):
2680 return element.find(_add_ns('ContentProtection')) is not None
2681
2682 def extract_multisegment_info(element, ms_parent_info):
2683 ms_info = ms_parent_info.copy()
2684
2685 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2686 # common attributes and elements. We will only extract what is
2687 # relevant for us.
2688 def extract_common(source):
2689 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2690 if segment_timeline is not None:
2691 s_e = segment_timeline.findall(_add_ns('S'))
2692 if s_e:
2693 ms_info['total_number'] = 0
2694 ms_info['s'] = []
2695 for s in s_e:
2696 r = int(s.get('r', 0))
2697 ms_info['total_number'] += 1 + r
2698 ms_info['s'].append({
2699 't': int(s.get('t', 0)),
2700 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2701 'd': int(s.attrib['d']),
2702 'r': r,
2703 })
2704 start_number = source.get('startNumber')
2705 if start_number:
2706 ms_info['start_number'] = int(start_number)
2707 timescale = source.get('timescale')
2708 if timescale:
2709 ms_info['timescale'] = int(timescale)
2710 segment_duration = source.get('duration')
2711 if segment_duration:
2712 ms_info['segment_duration'] = float(segment_duration)
2713
2714 def extract_Initialization(source):
2715 initialization = source.find(_add_ns('Initialization'))
2716 if initialization is not None:
2717 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2718
2719 segment_list = element.find(_add_ns('SegmentList'))
2720 if segment_list is not None:
2721 extract_common(segment_list)
2722 extract_Initialization(segment_list)
2723 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2724 if segment_urls_e:
2725 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2726 else:
2727 segment_template = element.find(_add_ns('SegmentTemplate'))
2728 if segment_template is not None:
2729 extract_common(segment_template)
2730 media = segment_template.get('media')
2731 if media:
2732 ms_info['media'] = media
2733 initialization = segment_template.get('initialization')
2734 if initialization:
2735 ms_info['initialization'] = initialization
2736 else:
2737 extract_Initialization(segment_template)
2738 return ms_info
2739
2740 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2741 formats, subtitles = [], {}
2742 stream_numbers = collections.defaultdict(int)
2743 for period in mpd_doc.findall(_add_ns('Period')):
2744 period_duration = parse_duration(period.get('duration')) or mpd_duration
2745 period_ms_info = extract_multisegment_info(period, {
2746 'start_number': 1,
2747 'timescale': 1,
2748 })
2749 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2750 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2751 for representation in adaptation_set.findall(_add_ns('Representation')):
2752 representation_attrib = adaptation_set.attrib.copy()
2753 representation_attrib.update(representation.attrib)
2754 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2755 mime_type = representation_attrib['mimeType']
2756 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2757
2758 codecs = parse_codecs(representation_attrib.get('codecs', ''))
2759 if content_type not in ('video', 'audio', 'text'):
2760 if mime_type == 'image/jpeg':
2761 content_type = mime_type
2762 elif codecs['vcodec'] != 'none':
2763 content_type = 'video'
2764 elif codecs['acodec'] != 'none':
2765 content_type = 'audio'
2766 elif codecs.get('tcodec', 'none') != 'none':
2767 content_type = 'text'
2768 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2769 content_type = 'text'
2770 else:
2771 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2772 continue
2773
2774 base_url = ''
2775 for element in (representation, adaptation_set, period, mpd_doc):
2776 base_url_e = element.find(_add_ns('BaseURL'))
2777 if base_url_e is not None:
2778 base_url = base_url_e.text + base_url
2779 if re.match(r'^https?://', base_url):
2780 break
2781 if mpd_base_url and base_url.startswith('/'):
2782 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2783 elif mpd_base_url and not re.match(r'^https?://', base_url):
2784 if not mpd_base_url.endswith('/'):
2785 mpd_base_url += '/'
2786 base_url = mpd_base_url + base_url
2787 representation_id = representation_attrib.get('id')
2788 lang = representation_attrib.get('lang')
2789 url_el = representation.find(_add_ns('BaseURL'))
2790 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2791 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2792 if representation_id is not None:
2793 format_id = representation_id
2794 else:
2795 format_id = content_type
2796 if mpd_id:
2797 format_id = mpd_id + '-' + format_id
2798 if content_type in ('video', 'audio'):
2799 f = {
2800 'format_id': format_id,
2801 'manifest_url': mpd_url,
2802 'ext': mimetype2ext(mime_type),
2803 'width': int_or_none(representation_attrib.get('width')),
2804 'height': int_or_none(representation_attrib.get('height')),
2805 'tbr': float_or_none(bandwidth, 1000),
2806 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2807 'fps': int_or_none(representation_attrib.get('frameRate')),
2808 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2809 'format_note': 'DASH %s' % content_type,
2810 'filesize': filesize,
2811 'container': mimetype2ext(mime_type) + '_dash',
2812 **codecs
2813 }
2814 elif content_type == 'text':
2815 f = {
2816 'ext': mimetype2ext(mime_type),
2817 'manifest_url': mpd_url,
2818 'filesize': filesize,
2819 }
2820 elif content_type == 'image/jpeg':
2821 # See test case in VikiIE
2822 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2823 f = {
2824 'format_id': format_id,
2825 'ext': 'mhtml',
2826 'manifest_url': mpd_url,
2827 'format_note': 'DASH storyboards (jpeg)',
2828 'acodec': 'none',
2829 'vcodec': 'none',
2830 }
2831 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2832 f['has_drm'] = True
2833 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2834
2835 def prepare_template(template_name, identifiers):
2836 tmpl = representation_ms_info[template_name]
2837 # First of all, % characters outside $...$ templates
2838 # must be escaped by doubling for proper processing
2839 # by the % string-formatting operator used further on (see
2840 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2841 t = ''
2842 in_template = False
2843 for c in tmpl:
2844 t += c
2845 if c == '$':
2846 in_template = not in_template
2847 elif c == '%' and not in_template:
2848 t += c
2849 # Next, $...$ templates are translated to their
2850 # %(...) counterparts to be used with % operator
2851 if representation_id is not None:
2852 t = t.replace('$RepresentationID$', representation_id)
2853 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2854 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2855 t = t.replace('$$', '$')
2856 return t
2857
2858 # @initialization is a regular template like the @media one
2859 # so it should be handled just the same way (see
2860 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2861 if 'initialization' in representation_ms_info:
2862 initialization_template = prepare_template(
2863 'initialization',
2864 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2865 # $Time$ shall not be included for @initialization thus
2866 # only $Bandwidth$ remains
2867 ('Bandwidth', ))
2868 representation_ms_info['initialization_url'] = initialization_template % {
2869 'Bandwidth': bandwidth,
2870 }
2871
2872 def location_key(location):
2873 return 'url' if re.match(r'^https?://', location) else 'path'
2874
2875 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2876
2877 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2878 media_location_key = location_key(media_template)
2879
2880 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2881 # can't be used at the same time
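# For instance, a hypothetical 120 s period split into 4 s segments
# yields total_number = ceil(120 / 4) = 30 numbered fragments below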
2882 if '%(Number' in media_template and 's' not in representation_ms_info:
2883 segment_duration = None
2884 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2885 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2886 representation_ms_info['total_number'] = int(math.ceil(
2887 float_or_none(period_duration, segment_duration, default=0)))
2888 representation_ms_info['fragments'] = [{
2889 media_location_key: media_template % {
2890 'Number': segment_number,
2891 'Bandwidth': bandwidth,
2892 },
2893 'duration': segment_duration,
2894 } for segment_number in range(
2895 representation_ms_info['start_number'],
2896 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2897 else:
2898 # $Number*$ or $Time$ in media template with S list available
2899 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2900 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
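# e.g. a hypothetical <S t="0" d="4000" r="2"/> entry produces three
# fragments at times 0, 4000 and 8000 (in timescale units), each
# lasting d / timescale seconds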
2901 representation_ms_info['fragments'] = []
2902 segment_time = 0
2903 segment_d = None
2904 segment_number = representation_ms_info['start_number']
2905
2906 def add_segment_url():
2907 segment_url = media_template % {
2908 'Time': segment_time,
2909 'Bandwidth': bandwidth,
2910 'Number': segment_number,
2911 }
2912 representation_ms_info['fragments'].append({
2913 media_location_key: segment_url,
2914 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2915 })
2916
2917 for num, s in enumerate(representation_ms_info['s']):
2918 segment_time = s.get('t') or segment_time
2919 segment_d = s['d']
2920 add_segment_url()
2921 segment_number += 1
2922 for r in range(s.get('r', 0)):
2923 segment_time += segment_d
2924 add_segment_url()
2925 segment_number += 1
2926 segment_time += segment_d
2927 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2928 # No media template
2929 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2930 # or any YouTube dashsegments video
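# Each <S> element consumes r + 1 consecutive entries of
# segment_urls here, all sharing the same duration d / timescale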
2931 fragments = []
2932 segment_index = 0
2933 timescale = representation_ms_info['timescale']
2934 for s in representation_ms_info['s']:
2935 duration = float_or_none(s['d'], timescale)
2936 for r in range(s.get('r', 0) + 1):
2937 segment_uri = representation_ms_info['segment_urls'][segment_index]
2938 fragments.append({
2939 location_key(segment_uri): segment_uri,
2940 'duration': duration,
2941 })
2942 segment_index += 1
2943 representation_ms_info['fragments'] = fragments
2944 elif 'segment_urls' in representation_ms_info:
2945 # Segment URLs with no SegmentTimeline
2946 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2947 # https://github.com/ytdl-org/youtube-dl/pull/14844
2948 fragments = []
2949 segment_duration = float_or_none(
2950 representation_ms_info['segment_duration'],
2951 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2952 for segment_url in representation_ms_info['segment_urls']:
2953 fragment = {
2954 location_key(segment_url): segment_url,
2955 }
2956 if segment_duration:
2957 fragment['duration'] = segment_duration
2958 fragments.append(fragment)
2959 representation_ms_info['fragments'] = fragments
2960 # If there is a fragments key available then we correctly recognized fragmented media.
2961 # Otherwise we will assume unfragmented media with direct access. Technically, this
2962 # assumption is not necessarily correct since we may simply have no support for
2963 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2964 if 'fragments' in representation_ms_info:
2965 f.update({
2966 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2967 'url': mpd_url or base_url,
2968 'fragment_base_url': base_url,
2969 'fragments': [],
2970 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2971 })
2972 if 'initialization_url' in representation_ms_info:
2973 initialization_url = representation_ms_info['initialization_url']
2974 if not f.get('url'):
2975 f['url'] = initialization_url
2976 f['fragments'].append({location_key(initialization_url): initialization_url})
2977 f['fragments'].extend(representation_ms_info['fragments'])
2978 if not period_duration:
2979 period_duration = try_get(
2980 representation_ms_info,
2981 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2982 else:
2983 # Assuming direct URL to unfragmented media.
2984 f['url'] = base_url
2985 if content_type in ('video', 'audio', 'image/jpeg'):
2986 f['manifest_stream_number'] = stream_numbers[f['url']]
2987 stream_numbers[f['url']] += 1
2988 formats.append(f)
2989 elif content_type == 'text':
2990 subtitles.setdefault(lang or 'und', []).append(f)
2991
2992 return formats, subtitles
2993
2994 def _extract_ism_formats(self, *args, **kwargs):
2995 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2996 if subs:
2997 self._report_ignoring_subs('ISM')
2998 return fmts
2999
3000 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
3001 res = self._download_xml_handle(
3002 ism_url, video_id,
3003 note='Downloading ISM manifest' if note is None else note,
3004 errnote='Failed to download ISM manifest' if errnote is None else errnote,
3005 fatal=fatal, data=data, headers=headers, query=query)
3006 if res is False:
3007 return [], {}
3008 ism_doc, urlh = res
3009 if ism_doc is None:
3010 return [], {}
3011
3012 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
3013
3014 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
3015 """
3016 Parse formats from ISM manifest.
3017 References:
3018 1. [MS-SSTR]: Smooth Streaming Protocol,
3019 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3020 """
3021 if ism_doc.get('IsLive') == 'TRUE':
3022 return [], {}
3023
3024 duration = int(ism_doc.attrib['Duration'])
3025 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
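# e.g. Duration="1234567890" at the default timescale of 10000000
# (100 ns ticks, per [MS-SSTR]) corresponds to ~123.5 seconds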
3026
3027 formats = []
3028 subtitles = {}
3029 for stream in ism_doc.findall('StreamIndex'):
3030 stream_type = stream.get('Type')
3031 if stream_type not in ('video', 'audio', 'text'):
3032 continue
3033 url_pattern = stream.attrib['Url']
3034 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3035 stream_name = stream.get('Name')
3036 stream_language = stream.get('Language', 'und')
3037 for track in stream.findall('QualityLevel'):
3038 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3039 # TODO: add support for WVC1 and WMAP
3040 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3041 self.report_warning('%s is not a supported codec' % fourcc)
3042 continue
3043 tbr = int(track.attrib['Bitrate']) // 1000
3044 # [1] does not mention Width and Height attributes. However,
3045 # they're often present while MaxWidth and MaxHeight are
3046 # missing, so they should be used as fallbacks
3047 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3048 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3049 sampling_rate = int_or_none(track.get('SamplingRate'))
3050
3051 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3052 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
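# e.g. a typical pattern 'QualityLevels({bitrate})/Fragments(video={start time})'
# becomes '.../QualityLevels(1500000)/Fragments(video={start time})' for a
# hypothetical 1.5 Mbps track; {start time} is substituted per fragment below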
3053
3054 fragments = []
3055 fragment_ctx = {
3056 'time': 0,
3057 }
3058 stream_fragments = stream.findall('c')
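# e.g. a hypothetical chunk <c t="0" d="20000000" r="3"/> yields three
# 2 s fragments here: as handled below, r is the total fragment count
# rather than the number of additional repeats (contrast with DASH)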
3059 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3060 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3061 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3062 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3063 if not fragment_ctx['duration']:
3064 try:
3065 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
3066 except IndexError:
3067 next_fragment_time = duration
3068 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3069 for _ in range(fragment_repeat):
3070 fragments.append({
3071 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3072 'duration': fragment_ctx['duration'] / stream_timescale,
3073 })
3074 fragment_ctx['time'] += fragment_ctx['duration']
3075
3076 if stream_type == 'text':
3077 subtitles.setdefault(stream_language, []).append({
3078 'ext': 'ismt',
3079 'protocol': 'ism',
3080 'url': ism_url,
3081 'manifest_url': ism_url,
3082 'fragments': fragments,
3083 '_download_params': {
3084 'stream_type': stream_type,
3085 'duration': duration,
3086 'timescale': stream_timescale,
3087 'fourcc': fourcc,
3088 'language': stream_language,
3089 'codec_private_data': track.get('CodecPrivateData'),
3090 }
3091 })
3092 elif stream_type in ('video', 'audio'):
3093 formats.append({
3094 'format_id': join_nonempty(ism_id, stream_name, tbr),
3095 'url': ism_url,
3096 'manifest_url': ism_url,
3097 'ext': 'ismv' if stream_type == 'video' else 'isma',
3098 'width': width,
3099 'height': height,
3100 'tbr': tbr,
3101 'asr': sampling_rate,
3102 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3103 'acodec': 'none' if stream_type == 'video' else fourcc,
3104 'protocol': 'ism',
3105 'fragments': fragments,
3106 'has_drm': ism_doc.find('Protection') is not None,
3107 '_download_params': {
3108 'stream_type': stream_type,
3109 'duration': duration,
3110 'timescale': stream_timescale,
3111 'width': width or 0,
3112 'height': height or 0,
3113 'fourcc': fourcc,
3114 'language': stream_language,
3115 'codec_private_data': track.get('CodecPrivateData'),
3116 'sampling_rate': sampling_rate,
3117 'channels': int_or_none(track.get('Channels', 2)),
3118 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3119 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3120 },
3121 })
3122 return formats, subtitles
3123
3124 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3125 def absolute_url(item_url):
3126 return urljoin(base_url, item_url)
3127
3128 def parse_content_type(content_type):
3129 if not content_type:
3130 return {}
3131 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3132 if ctr:
3133 mimetype, codecs = ctr.groups()
3134 f = parse_codecs(codecs)
3135 f['ext'] = mimetype2ext(mimetype)
3136 return f
3137 return {}
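# e.g. 'video/mp4; codecs="avc1.4D401E, mp4a.40.2"' should yield
# ext 'mp4' with vcodec 'avc1.4D401E' and acodec 'mp4a.40.2'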
3138
3139 def _media_formats(src, cur_media_type, type_info={}):
3140 full_url = absolute_url(src)
3141 ext = type_info.get('ext') or determine_ext(full_url)
3142 if ext == 'm3u8':
3143 is_plain_url = False
3144 formats = self._extract_m3u8_formats(
3145 full_url, video_id, ext='mp4',
3146 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3147 preference=preference, quality=quality, fatal=False)
3148 elif ext == 'mpd':
3149 is_plain_url = False
3150 formats = self._extract_mpd_formats(
3151 full_url, video_id, mpd_id=mpd_id, fatal=False)
3152 else:
3153 is_plain_url = True
3154 formats = [{
3155 'url': full_url,
3156 'vcodec': 'none' if cur_media_type == 'audio' else None,
3157 }]
3158 return is_plain_url, formats
3159
3160 entries = []
3161 # amp-video and amp-audio are very similar to their HTML5 counterparts
3162 # so we will include them right here (see
3163 # https://www.ampproject.org/docs/reference/components/amp-video)
3164 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3165 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3166 media_tags = [(media_tag, media_tag_name, media_type, '')
3167 for media_tag, media_tag_name, media_type
3168 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3169 media_tags.extend(re.findall(
3170 # We only allow video|audio followed by a whitespace or '>'.
3171 # Allowing more characters may result in a significant slowdown (see
3172 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3173 # http://www.porntrex.com/maps/videositemap.xml).
3174 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
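# e.g. both a self-closing '<amp-video src="clip.mp4"/>' and a paired
# '<video controls><source src="clip.m3u8"></video>' are captured here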
3175 for media_tag, _, media_type, media_content in media_tags:
3176 media_info = {
3177 'formats': [],
3178 'subtitles': {},
3179 }
3180 media_attributes = extract_attributes(media_tag)
3181 src = strip_or_none(media_attributes.get('src'))
3182 if src:
3183 _, formats = _media_formats(src, media_type)
3184 media_info['formats'].extend(formats)
3185 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3186 if media_content:
3187 for source_tag in re.findall(r'<source[^>]+>', media_content):
3188 s_attr = extract_attributes(source_tag)
3189 # data-video-src and data-src are non-standard but seen
3190 # several times in the wild
3191 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3192 if not src:
3193 continue
3194 f = parse_content_type(s_attr.get('type'))
3195 is_plain_url, formats = _media_formats(src, media_type, f)
3196 if is_plain_url:
3197 # width, height, res, label and title attributes are
3198 # all non-standard but seen several times in the wild
3199 labels = [
3200 s_attr.get(lbl)
3201 for lbl in ('label', 'title')
3202 if str_or_none(s_attr.get(lbl))
3203 ]
3204 width = int_or_none(s_attr.get('width'))
3205 height = (int_or_none(s_attr.get('height'))
3206 or int_or_none(s_attr.get('res')))
3207 if not width or not height:
3208 for lbl in labels:
3209 resolution = parse_resolution(lbl)
3210 if not resolution:
3211 continue
3212 width = width or resolution.get('width')
3213 height = height or resolution.get('height')
3214 for lbl in labels:
3215 tbr = parse_bitrate(lbl)
3216 if tbr:
3217 break
3218 else:
3219 tbr = None
3220 f.update({
3221 'width': width,
3222 'height': height,
3223 'tbr': tbr,
3224 'format_id': s_attr.get('label') or s_attr.get('title'),
3225 })
3226 f.update(formats[0])
3227 media_info['formats'].append(f)
3228 else:
3229 media_info['formats'].extend(formats)
3230 for track_tag in re.findall(r'<track[^>]+>', media_content):
3231 track_attributes = extract_attributes(track_tag)
3232 kind = track_attributes.get('kind')
3233 if not kind or kind in ('subtitles', 'captions'):
3234 src = strip_or_none(track_attributes.get('src'))
3235 if not src:
3236 continue
3237 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3238 media_info['subtitles'].setdefault(lang, []).append({
3239 'url': absolute_url(src),
3240 })
3241 for f in media_info['formats']:
3242 f.setdefault('http_headers', {})['Referer'] = base_url
3243 if media_info['formats'] or media_info['subtitles']:
3244 entries.append(media_info)
3245 return entries
3246
3247 def _extract_akamai_formats(self, *args, **kwargs):
3248 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3249 if subs:
3250 self._report_ignoring_subs('akamai')
3251 return fmts
3252
3253 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3254 signed = 'hdnea=' in manifest_url
3255 if not signed:
3256 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3257 manifest_url = re.sub(
3258 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3259 '', manifest_url).strip('?')
3260
3261 formats = []
3262 subtitles = {}
3263
3264 hdcore_sign = 'hdcore=3.7.0'
3265 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
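# e.g. a hypothetical HLS URL 'https://ex.akamaihd.net/i/v/clip_,300,600,.mp4.csmil/master.m3u8'
# maps to the HDS manifest 'https://ex.akamaihd.net/z/v/clip_,300,600,.mp4.csmil/manifest.f4m'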
3266 hds_host = hosts.get('hds')
3267 if hds_host:
3268 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3269 if 'hdcore=' not in f4m_url:
3270 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3271 f4m_formats = self._extract_f4m_formats(
3272 f4m_url, video_id, f4m_id='hds', fatal=False)
3273 for entry in f4m_formats:
3274 entry.update({'extra_param_to_segment_url': hdcore_sign})
3275 formats.extend(f4m_formats)
3276
3277 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3278 hls_host = hosts.get('hls')
3279 if hls_host:
3280 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3281 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3282 m3u8_url, video_id, 'mp4', 'm3u8_native',
3283 m3u8_id='hls', fatal=False)
3284 formats.extend(m3u8_formats)
3285 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3286
3287 http_host = hosts.get('http')
3288 if http_host and m3u8_formats and not signed:
3289 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3290 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
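# e.g. for a hypothetical m3u8_url '.../i/v/clip_,300,600,.mp4.csmil/master.m3u8'
# the qualities are ['300', '600'] and each progressive URL takes the form
# 'http://<http_host>/v/clip_300.mp4'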
3291 qualities_length = len(qualities)
3292 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3293 i = 0
3294 for f in m3u8_formats:
3295 if f['vcodec'] != 'none':
3296 for protocol in ('http', 'https'):
3297 http_f = f.copy()
3298 del http_f['manifest_url']
3299 http_url = re.sub(
3300 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3301 http_f.update({
3302 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3303 'url': http_url,
3304 'protocol': protocol,
3305 })
3306 formats.append(http_f)
3307 i += 1
3308
3309 return formats, subtitles
3310
3311 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3312 query = compat_urlparse.urlparse(url).query
3313 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3314 mobj = re.search(
3315 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3316 url_base = mobj.group('url')
3317 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3318 formats = []
3319
3320 def manifest_url(manifest):
3321 m_url = '%s/%s' % (http_base_url, manifest)
3322 if query:
3323 m_url += '?%s' % query
3324 return m_url
3325
3326 if 'm3u8' not in skip_protocols:
3327 formats.extend(self._extract_m3u8_formats(
3328 manifest_url('playlist.m3u8'), video_id, 'mp4',
3329 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3330 if 'f4m' not in skip_protocols:
3331 formats.extend(self._extract_f4m_formats(
3332 manifest_url('manifest.f4m'),
3333 video_id, f4m_id='hds', fatal=False))
3334 if 'dash' not in skip_protocols:
3335 formats.extend(self._extract_mpd_formats(
3336 manifest_url('manifest.mpd'),
3337 video_id, mpd_id='dash', fatal=False))
3338 if re.search(r'(?:/smil:|\.smil)', url_base):
3339 if 'smil' not in skip_protocols:
3340 rtmp_formats = self._extract_smil_formats(
3341 manifest_url('jwplayer.smil'),
3342 video_id, fatal=False)
3343 for rtmp_format in rtmp_formats:
3344 rtsp_format = rtmp_format.copy()
3345 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3346 del rtsp_format['play_path']
3347 del rtsp_format['ext']
3348 rtsp_format.update({
3349 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3350 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3351 'protocol': 'rtsp',
3352 })
3353 formats.extend([rtmp_format, rtsp_format])
3354 else:
3355 for protocol in ('rtmp', 'rtsp'):
3356 if protocol not in skip_protocols:
3357 formats.append({
3358 'url': '%s:%s' % (protocol, url_base),
3359 'format_id': protocol,
3360 'protocol': protocol,
3361 })
3362 return formats
3363
3364 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3365 mobj = re.search(
3366 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3367 webpage)
3368 if mobj:
3369 try:
3370 jwplayer_data = self._parse_json(mobj.group('options'),
3371 video_id=video_id,
3372 transform_source=transform_source)
3373 except ExtractorError:
3374 pass
3375 else:
3376 if isinstance(jwplayer_data, dict):
3377 return jwplayer_data
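# e.g. this picks up inline embeds of the form
# jwplayer("myplayer").setup({"file": "https://example.com/clip.mp4"})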
3378
3379 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3380 jwplayer_data = self._find_jwplayer_data(
3381 webpage, video_id, transform_source=js_to_json)
3382 return self._parse_jwplayer_data(
3383 jwplayer_data, video_id, *args, **kwargs)
3384
3385 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3386 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3387 # JWPlayer backward compatibility: flattened playlists
3388 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3389 if 'playlist' not in jwplayer_data:
3390 jwplayer_data = {'playlist': [jwplayer_data]}
3391
3392 entries = []
3393
3394 # JWPlayer backward compatibility: single playlist item
3395 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3396 if not isinstance(jwplayer_data['playlist'], list):
3397 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3398
3399 for video_data in jwplayer_data['playlist']:
3400 # JWPlayer backward compatibility: flattened sources
3401 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3402 if 'sources' not in video_data:
3403 video_data['sources'] = [video_data]
3404
3405 this_video_id = video_id or video_data['mediaid']
3406
3407 formats = self._parse_jwplayer_formats(
3408 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3409 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3410
3411 subtitles = {}
3412 tracks = video_data.get('tracks')
3413 if tracks and isinstance(tracks, list):
3414 for track in tracks:
3415 if not isinstance(track, dict):
3416 continue
3417 track_kind = track.get('kind')
3418 if not track_kind or not isinstance(track_kind, compat_str):
3419 continue
3420 if track_kind.lower() not in ('captions', 'subtitles'):
3421 continue
3422 track_url = urljoin(base_url, track.get('file'))
3423 if not track_url:
3424 continue
3425 subtitles.setdefault(track.get('label') or 'en', []).append({
3426 'url': self._proto_relative_url(track_url)
3427 })
3428
3429 entry = {
3430 'id': this_video_id,
3431 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3432 'description': clean_html(video_data.get('description')),
3433 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3434 'timestamp': int_or_none(video_data.get('pubdate')),
3435 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3436 'subtitles': subtitles,
3437 }
3438 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3439 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3440 entry.update({
3441 '_type': 'url_transparent',
3442 'url': formats[0]['url'],
3443 })
3444 else:
3445 self._sort_formats(formats)
3446 entry['formats'] = formats
3447 entries.append(entry)
3448 if len(entries) == 1:
3449 return entries[0]
3450 else:
3451 return self.playlist_result(entries)
3452
3453 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3454 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3455 urls = []
3456 formats = []
3457 for source in jwplayer_sources_data:
3458 if not isinstance(source, dict):
3459 continue
3460 source_url = urljoin(
3461 base_url, self._proto_relative_url(source.get('file')))
3462 if not source_url or source_url in urls:
3463 continue
3464 urls.append(source_url)
3465 source_type = source.get('type') or ''
3466 ext = mimetype2ext(source_type) or determine_ext(source_url)
3467 if source_type == 'hls' or ext == 'm3u8':
3468 formats.extend(self._extract_m3u8_formats(
3469 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3470 m3u8_id=m3u8_id, fatal=False))
3471 elif source_type == 'dash' or ext == 'mpd':
3472 formats.extend(self._extract_mpd_formats(
3473 source_url, video_id, mpd_id=mpd_id, fatal=False))
3474 elif ext == 'smil':
3475 formats.extend(self._extract_smil_formats(
3476 source_url, video_id, fatal=False))
3477 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3478 elif source_type.startswith('audio') or ext in (
3479 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3480 formats.append({
3481 'url': source_url,
3482 'vcodec': 'none',
3483 'ext': ext,
3484 })
3485 else:
3486 height = int_or_none(source.get('height'))
3487 if height is None:
3488 # Often no height is provided but there is a label in
3489 # a format like "1080p", "720p SD", or 1080.
3490 height = int_or_none(self._search_regex(
3491 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3492 'height', default=None))
3493 a_format = {
3494 'url': source_url,
3495 'width': int_or_none(source.get('width')),
3496 'height': height,
3497 'tbr': int_or_none(source.get('bitrate')),
3498 'ext': ext,
3499 }
3500 if source_url.startswith('rtmp'):
3501 a_format['ext'] = 'flv'
3502 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3503 # of jwplayer.flash.swf
3504 rtmp_url_parts = re.split(
3505 r'((?:mp4|mp3|flv):)', source_url, 1)
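# e.g. a hypothetical 'rtmp://cdn.example.com/app/mp4:videos/clip.mp4'
# splits into url 'rtmp://cdn.example.com/app/' plus
# play_path 'mp4:videos/clip.mp4'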
3506 if len(rtmp_url_parts) == 3:
3507 rtmp_url, prefix, play_path = rtmp_url_parts
3508 a_format.update({
3509 'url': rtmp_url,
3510 'play_path': prefix + play_path,
3511 })
3512 if rtmp_params:
3513 a_format.update(rtmp_params)
3514 formats.append(a_format)
3515 return formats
3516
3517 def _live_title(self, name):
3518 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3519 return name
3520
3521 def _int(self, v, name, fatal=False, **kwargs):
3522 res = int_or_none(v, **kwargs)
3523 if res is None:
3524 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3525 if fatal:
3526 raise ExtractorError(msg)
3527 else:
3528 self.report_warning(msg)
3529 return res
3530
3531 def _float(self, v, name, fatal=False, **kwargs):
3532 res = float_or_none(v, **kwargs)
3533 if res is None:
3534 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3535 if fatal:
3536 raise ExtractorError(msg)
3537 else:
3538 self.report_warning(msg)
3539 return res
3540
3541 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3542 path='/', secure=False, discard=False, rest={}, **kwargs):
3543 cookie = compat_cookiejar_Cookie(
3544 0, name, value, port, port is not None, domain, True,
3545 domain.startswith('.'), path, True, secure, expire_time,
3546 discard, None, None, rest)
3547 self._downloader.cookiejar.set_cookie(cookie)
3548
3549 def _get_cookies(self, url):
3550 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3551 req = sanitized_Request(url)
3552 self._downloader.cookiejar.add_cookie_header(req)
3553 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3554
3555 def _apply_first_set_cookie_header(self, url_handle, cookie):
3556 """
3557 Apply first Set-Cookie header instead of the last. Experimental.
3558
3559 Some sites (e.g. [1-3]) may serve two cookies under the same name
3560 in the Set-Cookie header and expect the first (old) one to be set
3561 rather than the second (new) one. However, as per RFC 6265 the newer
3562 cookie should be set into the cookie store, which is what actually
3563 happens. We work around this issue by manually resetting the cookie
3564 to the first one.
3565 1. https://new.vk.com/
3566 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3567 3. https://learning.oreilly.com/
3568 """
3569 for header, cookies in url_handle.headers.items():
3570 if header.lower() != 'set-cookie':
3571 continue
3572 if sys.version_info[0] >= 3:
3573 cookies = cookies.encode('iso-8859-1')
3574 cookies = cookies.decode('utf-8')
3575 cookie_value = re.search(
3576 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3577 if cookie_value:
3578 value, domain = cookie_value.groups()
3579 self._set_cookie(domain, cookie, value)
3580 break
3581
3582 def get_testcases(self, include_onlymatching=False):
3583 t = getattr(self, '_TEST', None)
3584 if t:
3585 assert not hasattr(self, '_TESTS'), \
3586 '%s has _TEST and _TESTS' % type(self).__name__
3587 tests = [t]
3588 else:
3589 tests = getattr(self, '_TESTS', [])
3590 for t in tests:
3591 if not include_onlymatching and t.get('only_matching', False):
3592 continue
3593 t['name'] = type(self).__name__[:-len('IE')]
3594 yield t
3595
3596 def is_suitable(self, age_limit):
3597 """ Test whether the extractor is generally suitable for the given
3598 age limit (i.e. pornographic sites are not, all others usually are) """
3599
3600 any_restricted = False
3601 for tc in self.get_testcases(include_onlymatching=False):
3602 if tc.get('playlist', []):
3603 tc = tc['playlist'][0]
3604 is_restricted = age_restricted(
3605 tc.get('info_dict', {}).get('age_limit'), age_limit)
3606 if not is_restricted:
3607 return True
3608 any_restricted = any_restricted or is_restricted
3609 return not any_restricted
3610
3611 def extract_subtitles(self, *args, **kwargs):
3612 if (self.get_param('writesubtitles', False)
3613 or self.get_param('listsubtitles')):
3614 return self._get_subtitles(*args, **kwargs)
3615 return {}
3616
3617 def _get_subtitles(self, *args, **kwargs):
3618 raise NotImplementedError('This method must be implemented by subclasses')
3619
3620 def extract_comments(self, *args, **kwargs):
3621 if not self.get_param('getcomments'):
3622 return None
3623 generator = self._get_comments(*args, **kwargs)
3624
3625 def extractor():
3626 comments = []
3627 interrupted = True
3628 try:
3629 while True:
3630 comments.append(next(generator))
3631 except StopIteration:
3632 interrupted = False
3633 except KeyboardInterrupt:
3634 self.to_screen('Interrupted by user')
3635 except Exception as e:
3636 if self.get_param('ignoreerrors') is not True:
3637 raise
3638 self._downloader.report_error(e)
3639 comment_count = len(comments)
3640 self.to_screen(f'Extracted {comment_count} comments')
3641 return {
3642 'comments': comments,
3643 'comment_count': None if interrupted else comment_count
3644 }
3645 return extractor
3646
3647 def _get_comments(self, *args, **kwargs):
3648 raise NotImplementedError('This method must be implemented by subclasses')
3649
3650 @staticmethod
3651 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3652 """ Merge subtitle items for one language. Items with duplicated URLs
3653 will be dropped. """
3654 list1_urls = {item['url'] for item in subtitle_list1}
3655 ret = list(subtitle_list1)
3656 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3657 return ret
3658
3659 @classmethod
3660 def _merge_subtitles(cls, *dicts, target=None):
3661 """ Merge subtitle dictionaries, language by language. """
3662 if target is None:
3663 target = {}
3664 for d in dicts:
3665 for lang, subs in d.items():
3666 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3667 return target
3668
3669 def extract_automatic_captions(self, *args, **kwargs):
3670 if (self.get_param('writeautomaticsub', False)
3671 or self.get_param('listsubtitles')):
3672 return self._get_automatic_captions(*args, **kwargs)
3673 return {}
3674
3675 def _get_automatic_captions(self, *args, **kwargs):
3676 raise NotImplementedError('This method must be implemented by subclasses')
3677
3678 def mark_watched(self, *args, **kwargs):
3679 if not self.get_param('mark_watched', False):
3680 return
3681 if (hasattr(self, '_NETRC_MACHINE') and self._get_login_info()[0] is not None
3682 or self.get_param('cookiefile')
3683 or self.get_param('cookiesfrombrowser')):
3684 self._mark_watched(*args, **kwargs)
3685
3686 def _mark_watched(self, *args, **kwargs):
3687 raise NotImplementedError('This method must be implemented by subclasses')
3688
3689 def geo_verification_headers(self):
3690 headers = {}
3691 geo_verification_proxy = self.get_param('geo_verification_proxy')
3692 if geo_verification_proxy:
3693 headers['Ytdl-request-proxy'] = geo_verification_proxy
3694 return headers
3695
3696 def _generic_id(self, url):
3697 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3698
3699 def _generic_title(self, url):
3700 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3701
3702 @staticmethod
3703 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3704 all_known = all(
3705 x is not None for x in
3706 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
3707 return (
3708 'private' if is_private
3709 else 'premium_only' if needs_premium
3710 else 'subscriber_only' if needs_subscription
3711 else 'needs_auth' if needs_auth
3712 else 'unlisted' if is_unlisted
3713 else 'public' if all_known
3714 else None)
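# e.g. _availability(is_private=False, needs_premium=False,
# needs_subscription=False, needs_auth=False, is_unlisted=True)
# evaluates to 'unlisted', while five False values give 'public'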
3715
3716 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3717 '''
3718 @returns A list of values for the extractor argument given by "key"
3719 or "default" if no such key is present
3720 @param default The default value to return when the key is not present (default: [])
3721 @param casesense When false, the values are converted to lower case
3722 '''
3723 val = traverse_obj(
3724 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3725 if val is None:
3726 return [] if default is NO_DEFAULT else default
3727 return list(val) if casesense else [x.lower() for x in val]
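# e.g. with --extractor-args "youtube:player_client=android,web",
# a hypothetical self._configuration_arg('player_client') call inside
# YoutubeIE would return ['android', 'web']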
3728
3729 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3730 if not playlist_id or not video_id:
3731 return not video_id
3732
3733 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3734 if no_playlist is not None:
3735 return not no_playlist
3736
3737 video_id = '' if video_id is True else f' {video_id}'
3738 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3739 if self.get_param('noplaylist'):
3740 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3741 return False
3742 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3743 return True
3744
3745
3746 class SearchInfoExtractor(InfoExtractor):
3747 """
3748 Base class for paged search query extractors.
3749 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3750 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3751 """
3752
3753 _MAX_RESULTS = float('inf')
3754
3755 @classmethod
3756 def _make_valid_url(cls):
3757 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
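# e.g. with _SEARCH_KEY = 'ytsearch' this matches 'ytsearch:cats',
# 'ytsearch5:cats' and 'ytsearchall:cats'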
3758
3759 def _real_extract(self, query):
3760 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3761 if prefix == '':
3762 return self._get_n_results(query, 1)
3763 elif prefix == 'all':
3764 return self._get_n_results(query, self._MAX_RESULTS)
3765 else:
3766 n = int(prefix)
3767 if n <= 0:
3768 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3769 elif n > self._MAX_RESULTS:
3770 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3771 n = self._MAX_RESULTS
3772 return self._get_n_results(query, n)
3773
3774 def _get_n_results(self, query, n):
3775 """Get a specified number of results for a query.
3776 Either this function or _search_results must be overridden by subclasses """
3777 return self.playlist_result(
3778 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3779 query, query)
3780
3781 def _search_results(self, query):
3782 """Returns an iterator of search results"""
3783 raise NotImplementedError('This method must be implemented by subclasses')
3784
3785 @property
3786 def SEARCH_KEY(self):
3787 return self._SEARCH_KEY