]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/common.py
[digitalconcerthall] Add extractor (#1931)
[yt-dlp.git] / yt_dlp / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import collections
6 import hashlib
7 import itertools
8 import json
9 import netrc
10 import os
11 import random
12 import re
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18 compat_cookiejar_Cookie,
19 compat_cookies_SimpleCookie,
20 compat_etree_Element,
21 compat_etree_fromstring,
22 compat_expanduser,
23 compat_getpass,
24 compat_http_client,
25 compat_os_name,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_parse_unquote,
29 compat_urllib_parse_urlencode,
30 compat_urllib_request,
31 compat_urlparse,
32 compat_xml_parse_error,
33 )
34 from ..downloader import FileDownloader
35 from ..downloader.f4m import (
36 get_base_url,
37 remove_encrypted_media,
38 )
39 from ..utils import (
40 age_restricted,
41 base_url,
42 bug_reports_message,
43 clean_html,
44 compiled_regex_type,
45 determine_ext,
46 determine_protocol,
47 dict_get,
48 error_to_compat_str,
49 extract_attributes,
50 ExtractorError,
51 fix_xml_ampersands,
52 float_or_none,
53 format_field,
54 GeoRestrictedError,
55 GeoUtils,
56 int_or_none,
57 join_nonempty,
58 js_to_json,
59 JSON_LD_RE,
60 mimetype2ext,
61 network_exceptions,
62 NO_DEFAULT,
63 orderedSet,
64 parse_bitrate,
65 parse_codecs,
66 parse_duration,
67 parse_iso8601,
68 parse_m3u8_attributes,
69 parse_resolution,
70 RegexNotFoundError,
71 sanitize_filename,
72 sanitized_Request,
73 str_or_none,
74 str_to_int,
75 strip_or_none,
76 traverse_obj,
77 unescapeHTML,
78 UnsupportedError,
79 unified_strdate,
80 unified_timestamp,
81 update_Request,
82 update_url_query,
83 url_basename,
84 url_or_none,
85 urljoin,
86 variadic,
87 xpath_element,
88 xpath_text,
89 xpath_with_ns,
90 )
91
92
93 class InfoExtractor(object):
94 """Information Extractor class.
95
96 Information extractors are the classes that, given a URL, extract
97 information about the video (or videos) the URL refers to. This
98 information includes the real video URL, the video title, author and
99 others. The information is stored in a dictionary which is then
100 passed to the YoutubeDL. The YoutubeDL processes this
101 information possibly downloading the video to the file system, among
102 other possible outcomes.
103
104 The type field determines the type of the result.
105 By far the most common value (and the default if _type is missing) is
106 "video", which indicates a single video.
107
108 For a video, the dictionaries must include the following fields:
109
110 id: Video identifier.
111 title: Video title, unescaped.
112
113 Additionally, it must contain either a formats entry or a url one:
114
115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
117
118 Potential fields:
119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
121 for RTMP - RTMP URL,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
124 for DASH
125 - HTTP URL to plain file media (in case of
126 unfragmented media)
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
129 is parsed from a string (in case of
130 fragmented media)
131 for MSS - URL of the ISM manifest.
132 * manifest_url
133 The URL of the manifest file in case of
134 fragmented media:
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
139 * ext Will be calculated from URL if missing
140 * format A human-readable description of the format
141 ("mp4 container with h264/opus").
142 Calculated from the format_id, width, height.
143 and format_note fields if missing.
144 * format_id A short description of the format
145 ("mp4_h264_opus" or "19").
146 Technically optional, but strongly recommended.
147 * format_note Additional info about the format
148 ("3D" or "DASH video")
149 * width Width of the video, if known
150 * height Height of the video, if known
151 * resolution Textual description of width and height
152 * dynamic_range The dynamic range of the video. One of:
153 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
154 * tbr Average bitrate of audio and video in KBit/s
155 * abr Average audio bitrate in KBit/s
156 * acodec Name of the audio codec in use
157 * asr Audio sampling rate in Hertz
158 * vbr Average video bitrate in KBit/s
159 * fps Frame rate
160 * vcodec Name of the video codec in use
161 * container Name of the container format
162 * filesize The number of bytes, if known in advance
163 * filesize_approx An estimate for the number of bytes
164 * player_url SWF Player URL (used for rtmpdump).
165 * protocol The protocol that will be used for the actual
166 download, lower-case. One of "http", "https" or
167 one of the protocols defined in downloader.PROTOCOL_MAP
168 * fragment_base_url
169 Base URL for fragments. Each fragment's path
170 value (if present) will be relative to
171 this URL.
172 * fragments A list of fragments of a fragmented media.
173 Each fragment entry must contain either an url
174 or a path. If an url is present it should be
175 considered by a client. Otherwise both path and
176 fragment_base_url must be present. Here is
177 the list of all potential fields:
178 * "url" - fragment's URL
179 * "path" - fragment's path relative to
180 fragment_base_url
181 * "duration" (optional, int or float)
182 * "filesize" (optional, int)
183 * is_from_start Is a live format that can be downloaded
184 from the start. Boolean
185 * preference Order number of this format. If this field is
186 present and not None, the formats get sorted
187 by this field, regardless of all other values.
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 < -1000 to hide the format (if there is
191 another one which is strictly better)
192 * language Language code, e.g. "de" or "en-US".
193 * language_preference Is this in the language mentioned in
194 the URL?
195 10 if it's what the URL is about,
196 -1 for default (don't know),
197 -10 otherwise, other values reserved for now.
198 * quality Order number of the video quality of this
199 format, irrespective of the file format.
200 -1 for default (order by other properties),
201 -2 or smaller for less than default.
202 * source_preference Order number for this video source
203 (quality takes higher priority)
204 -1 for default (order by other properties),
205 -2 or smaller for less than default.
206 * http_headers A dictionary of additional HTTP headers
207 to add to the request.
208 * stretched_ratio If given and not 1, indicates that the
209 video's pixels are not square.
210 width : height ratio as float.
211 * no_resume The server does not support resuming the
212 (HTTP or RTMP) download. Boolean.
213 * has_drm The format has DRM and cannot be downloaded. Boolean
214 * downloader_options A dictionary of downloader options as
215 described in FileDownloader
216 RTMP formats can also have the additional fields: page_url,
217 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
218 rtmp_protocol, rtmp_real_time
219
220 url: Final video URL.
221 ext: Video filename extension.
222 format: The video format, defaults to ext (used for --get-format)
223 player_url: SWF Player URL (used for rtmpdump).
224
225 The following fields are optional:
226
227 alt_title: A secondary title of the video.
228 display_id An alternative identifier for the video, not necessarily
229 unique, but available before title. Typically, id is
230 something like "4234987", title "Dancing naked mole rats",
231 and display_id "dancing-naked-mole-rats"
232 thumbnails: A list of dictionaries, with the following entries:
233 * "id" (optional, string) - Thumbnail format ID
234 * "url"
235 * "preference" (optional, int) - quality of the image
236 * "width" (optional, int)
237 * "height" (optional, int)
238 * "resolution" (optional, string "{width}x{height}",
239 deprecated)
240 * "filesize" (optional, int)
241 thumbnail: Full URL to a video thumbnail image.
242 description: Full video description.
243 uploader: Full name of the video uploader.
244 license: License name the video is licensed under.
245 creator: The creator of the video.
246 timestamp: UNIX timestamp of the moment the video was uploaded
247 upload_date: Video upload date (YYYYMMDD).
248 If not explicitly set, calculated from timestamp
249 release_timestamp: UNIX timestamp of the moment the video was released.
250 If it is not clear whether to use timestamp or this, use the former
251 release_date: The date (YYYYMMDD) when the video was released.
252 If not explicitly set, calculated from release_timestamp
253 modified_timestamp: UNIX timestamp of the moment the video was last modified.
254 modified_date: The date (YYYYMMDD) when the video was last modified.
255 If not explicitly set, calculated from modified_timestamp
256 uploader_id: Nickname or id of the video uploader.
257 uploader_url: Full URL to a personal webpage of the video uploader.
258 channel: Full name of the channel the video is uploaded on.
259 Note that channel fields may or may not repeat uploader
260 fields. This depends on a particular extractor.
261 channel_id: Id of the channel.
262 channel_url: Full URL to a channel webpage.
263 location: Physical location where the video was filmed.
264 subtitles: The available subtitles as a dictionary in the format
265 {tag: subformats}. "tag" is usually a language code, and
266 "subformats" is a list sorted from lower to higher
267 preference, each element is a dictionary with the "ext"
268 entry and one of:
269 * "data": The subtitles file contents
270 * "url": A URL pointing to the subtitles file
271 It can optionally also have:
272 * "name": Name or description of the subtitles
273 "ext" will be calculated from URL if missing
274 automatic_captions: Like 'subtitles'; contains automatically generated
275 captions instead of normal subtitles
276 duration: Length of the video in seconds, as an integer or float.
277 view_count: How many users have watched the video on the platform.
278 like_count: Number of positive ratings of the video
279 dislike_count: Number of negative ratings of the video
280 repost_count: Number of reposts of the video
281     average_rating: Average rating given by users, the scale used depends on the webpage
282 comment_count: Number of comments on the video
283 comments: A list of comments, each with one or more of the following
284 properties (all but one of text or html optional):
285 * "author" - human-readable name of the comment author
286 * "author_id" - user ID of the comment author
287 * "author_thumbnail" - The thumbnail of the comment author
288 * "id" - Comment ID
289 * "html" - Comment as HTML
290 * "text" - Plain text of the comment
291 * "timestamp" - UNIX timestamp of comment
292 * "parent" - ID of the comment this one is replying to.
293 Set to "root" to indicate that this is a
294 comment to the original video.
295 * "like_count" - Number of positive ratings of the comment
296 * "dislike_count" - Number of negative ratings of the comment
297 * "is_favorited" - Whether the comment is marked as
298 favorite by the video uploader
299 * "author_is_uploader" - Whether the comment is made by
300 the video uploader
301 age_limit: Age restriction for the video, as an integer (years)
302 webpage_url: The URL to the video webpage, if given to yt-dlp it
303 should allow to get the same result again. (It will be set
304 by YoutubeDL if it's missing)
305 categories: A list of categories that the video falls in, for example
306 ["Sports", "Berlin"]
307 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
308 cast: A list of the video cast
309 is_live: True, False, or None (=unknown). Whether this video is a
310 live stream that goes on instead of a fixed-length video.
311 was_live: True, False, or None (=unknown). Whether this video was
312 originally a live stream.
313 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
314 If absent, automatically set from is_live, was_live
315 start_time: Time in seconds where the reproduction should start, as
316 specified in the URL.
317 end_time: Time in seconds where the reproduction should end, as
318 specified in the URL.
319 chapters: A list of dictionaries, with the following entries:
320 * "start_time" - The start time of the chapter in seconds
321 * "end_time" - The end time of the chapter in seconds
322 * "title" (optional, string)
323 playable_in_embed: Whether this video is allowed to play in embedded
324 players on other sites. Can be True (=always allowed),
325 False (=never allowed), None (=unknown), or a string
326 specifying the criteria for embedability (Eg: 'whitelist')
327 availability: Under what condition the video is available. One of
328 'private', 'premium_only', 'subscriber_only', 'needs_auth',
329 'unlisted' or 'public'. Use 'InfoExtractor._availability'
330 to set it
331 __post_extractor: A function to be called just before the metadata is
332 written to either disk, logger or console. The function
333 must return a dict which will be added to the info_dict.
334                         This is useful for additional information that is
335 time-consuming to extract. Note that the fields thus
336 extracted will not be available to output template and
337 match_filter. So, only "comments" and "comment_count" are
338 currently allowed to be extracted via this method.
339
340 The following fields should only be used when the video belongs to some logical
341 chapter or section:
342
343 chapter: Name or title of the chapter the video belongs to.
344 chapter_number: Number of the chapter the video belongs to, as an integer.
345 chapter_id: Id of the chapter the video belongs to, as a unicode string.
346
347 The following fields should only be used when the video is an episode of some
348 series, programme or podcast:
349
350 series: Title of the series or programme the video episode belongs to.
351 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
352 season: Title of the season the video episode belongs to.
353 season_number: Number of the season the video episode belongs to, as an integer.
354 season_id: Id of the season the video episode belongs to, as a unicode string.
355 episode: Title of the video episode. Unlike mandatory video title field,
356 this field should denote the exact title of the video episode
357 without any kind of decoration.
358 episode_number: Number of the video episode within a season, as an integer.
359 episode_id: Id of the video episode, as a unicode string.
360
361 The following fields should only be used when the media is a track or a part of
362 a music album:
363
364 track: Title of the track.
365 track_number: Number of the track within an album or a disc, as an integer.
366 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
367 as a unicode string.
368 artist: Artist(s) of the track.
369 genre: Genre(s) of the track.
370 album: Title of the album the track belongs to.
371 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
372 album_artist: List of all artists appeared on the album (e.g.
373 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
374 and compilations).
375 disc_number: Number of the disc or other physical medium the track belongs to,
376 as an integer.
377 release_year: Year (YYYY) when the album was released.
378 composer: Composer of the piece
379
380 Unless mentioned otherwise, the fields should be Unicode strings.
381
382 Unless mentioned otherwise, None is equivalent to absence of information.
383
384
385 _type "playlist" indicates multiple videos.
386 There must be a key "entries", which is a list, an iterable, or a PagedList
387 object, each element of which is a valid dictionary by this specification.
388
389     Additionally, playlists can have "id", "title", and any other relevant
390 attributes with the same semantics as videos (see above).
391
392 It can also have the following optional fields:
393
394 playlist_count: The total number of videos in a playlist. If not given,
395 YoutubeDL tries to calculate it from "entries"
396
397
398 _type "multi_video" indicates that there are multiple videos that
399     form a single show, for example multiple acts of an opera or TV episode.
400 It must have an entries key like a playlist and contain all the keys
401 required for a video at the same time.
402
403
404 _type "url" indicates that the video must be extracted from another
405 location, possibly by a different extractor. Its only required key is:
406 "url" - the next URL to extract.
407 The key "ie_key" can be set to the class name (minus the trailing "IE",
408 e.g. "Youtube") if the extractor class is known in advance.
409 Additionally, the dictionary may have any properties of the resolved entity
410 known in advance, for example "title" if the title of the referred video is
411 known ahead of time.
412
413
414 _type "url_transparent" entities have the same specification as "url", but
415 indicate that the given additional information is more precise than the one
416 associated with the resolved URL.
417 This is useful when a site employs a video service that hosts the video and
418 its technical metadata, but that video service does not embed a useful
419 title, description etc.
420
421
422 Subclasses of this one should re-define the _real_initialize() and
423 _real_extract() methods and define a _VALID_URL regexp.
424 Probably, they should also be added to the list of extractors.
425
426 Subclasses may also override suitable() if necessary, but ensure the function
427 signature is preserved and that this function imports everything it needs
428 (except other extractors), so that lazy_extractors works correctly
429
430 _GEO_BYPASS attribute may be set to False in order to disable
431 geo restriction bypass mechanisms for a particular extractor.
432 Though it won't disable explicit geo restriction bypass based on
433 country code provided with geo_bypass_country.
434
435 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
436 countries for this extractor. One of these countries will be used by
437 geo restriction bypass mechanism right away in order to bypass
438 geo restriction, of course, if the mechanism is not disabled.
439
440 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
441 IP blocks in CIDR notation for this extractor. One of these IP blocks
442 will be used by geo restriction bypass mechanism similarly
443 to _GEO_COUNTRIES.
444
445 The _WORKING attribute should be set to False for broken IEs
446 in order to warn the users and skip the tests.
447 """
448
    # Per-class/per-instance state defaults:
    _ready = False                  # True once _real_initialize() has run (see initialize())
    _downloader = None              # YoutubeDL instance, attached via set_downloader()
    _x_forwarded_for_ip = None      # faked source IP for geo bypass, if one was chosen
    _GEO_BYPASS = True              # subclasses set False to disable geo-restriction bypass
    _GEO_COUNTRIES = None           # list of presumably geo-unrestricted country codes
    _GEO_IP_BLOCKS = None           # list of presumably geo-unrestricted CIDR IP blocks
    _WORKING = True                 # set False for broken extractors (see working())

    # Messages suggesting how to supply credentials, keyed by which
    # authentication mechanisms the extractor supports.
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
464
465 def __init__(self, downloader=None):
466 """Constructor. Receives an optional downloader (a YoutubeDL instance).
467 If a downloader is not passed during initialization,
468 it must be set using "set_downloader()" before "extract()" is called"""
469 self._ready = False
470 self._x_forwarded_for_ip = None
471 self._printed_messages = set()
472 self.set_downloader(downloader)
473
474 @classmethod
475 def _match_valid_url(cls, url):
476 # This does not use has/getattr intentionally - we want to know whether
477 # we have cached the regexp for *this* class, whereas getattr would also
478 # match the superclass
479 if '_VALID_URL_RE' not in cls.__dict__:
480 if '_VALID_URL' not in cls.__dict__:
481 cls._VALID_URL = cls._make_valid_url()
482 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
483 return cls._VALID_URL_RE.match(url)
484
485 @classmethod
486 def suitable(cls, url):
487 """Receives a URL and returns True if suitable for this IE."""
488 # This function must import everything it needs (except other extractors),
489 # so that lazy_extractors works correctly
490 return cls._match_valid_url(url) is not None
491
492 @classmethod
493 def _match_id(cls, url):
494 return cls._match_valid_url(url).group('id')
495
496 @classmethod
497 def get_temp_id(cls, url):
498 try:
499 return cls._match_id(url)
500 except (IndexError, AttributeError):
501 return None
502
503 @classmethod
504 def working(cls):
505 """Getter method for _WORKING."""
506 return cls._WORKING
507
508 def initialize(self):
509 """Initializes an instance (authentication, etc)."""
510 self._printed_messages = set()
511 self._initialize_geo_bypass({
512 'countries': self._GEO_COUNTRIES,
513 'ip_blocks': self._GEO_IP_BLOCKS,
514 })
515 if not self._ready:
516 self._real_initialize()
517 self._ready = True
518
519 def _initialize_geo_bypass(self, geo_bypass_context):
520 """
521 Initialize geo restriction bypass mechanism.
522
523 This method is used to initialize geo bypass mechanism based on faking
524 X-Forwarded-For HTTP header. A random country from provided country list
525 is selected and a random IP belonging to this country is generated. This
526 IP will be passed as X-Forwarded-For HTTP header in all subsequent
527 HTTP requests.
528
529 This method will be used for initial geo bypass mechanism initialization
530 during the instance initialization with _GEO_COUNTRIES and
531 _GEO_IP_BLOCKS.
532
533 You may also manually call it from extractor's code if geo bypass
534 information is not available beforehand (e.g. obtained during
535 extraction) or due to some other reason. In this case you should pass
536 this information in geo bypass context passed as first argument. It may
537 contain following fields:
538
539 countries: List of geo unrestricted countries (similar
540 to _GEO_COUNTRIES)
541 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
542 (similar to _GEO_IP_BLOCKS)
543
544 """
545 if not self._x_forwarded_for_ip:
546
547 # Geo bypass mechanism is explicitly disabled by user
548 if not self.get_param('geo_bypass', True):
549 return
550
551 if not geo_bypass_context:
552 geo_bypass_context = {}
553
554 # Backward compatibility: previously _initialize_geo_bypass
555 # expected a list of countries, some 3rd party code may still use
556 # it this way
557 if isinstance(geo_bypass_context, (list, tuple)):
558 geo_bypass_context = {
559 'countries': geo_bypass_context,
560 }
561
562 # The whole point of geo bypass mechanism is to fake IP
563 # as X-Forwarded-For HTTP header based on some IP block or
564 # country code.
565
566 # Path 1: bypassing based on IP block in CIDR notation
567
568 # Explicit IP block specified by user, use it right away
569 # regardless of whether extractor is geo bypassable or not
570 ip_block = self.get_param('geo_bypass_ip_block', None)
571
572 # Otherwise use random IP block from geo bypass context but only
573 # if extractor is known as geo bypassable
574 if not ip_block:
575 ip_blocks = geo_bypass_context.get('ip_blocks')
576 if self._GEO_BYPASS and ip_blocks:
577 ip_block = random.choice(ip_blocks)
578
579 if ip_block:
580 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
581 self._downloader.write_debug(
582 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
583 return
584
585 # Path 2: bypassing based on country code
586
587 # Explicit country code specified by user, use it right away
588 # regardless of whether extractor is geo bypassable or not
589 country = self.get_param('geo_bypass_country', None)
590
591 # Otherwise use random country code from geo bypass context but
592 # only if extractor is known as geo bypassable
593 if not country:
594 countries = geo_bypass_context.get('countries')
595 if self._GEO_BYPASS and countries:
596 country = random.choice(countries)
597
598 if country:
599 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
600 self._downloader.write_debug(
601 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
602
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts.

        On a GeoRestrictedError, one retry is attempted with a faked
        X-Forwarded-For IP (see __maybe_fake_ip_and_retry).  All other
        failures are re-raised as ExtractorError (or a subclass) with the
        extractor's context attached.
        """
        try:
            # At most two attempts: the second only happens after a geo
            # restriction error for which a fake IP could be set up.
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        # Propagate the faked IP so the downloader reuses it
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    # compat option: drop live-chat "subtitles" when requested
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            # Re-raise the same exception type with extractor context
            # (video id, IE name, traceback, countries) filled in.
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
641
642 def __maybe_fake_ip_and_retry(self, countries):
643 if (not self.get_param('geo_bypass_country', None)
644 and self._GEO_BYPASS
645 and self.get_param('geo_bypass', True)
646 and not self._x_forwarded_for_ip
647 and countries):
648 country_code = random.choice(countries)
649 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
650 if self._x_forwarded_for_ip:
651 self.report_warning(
652 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
653 % (self._x_forwarded_for_ip, country_code.upper()))
654 return True
655 return False
656
657 def set_downloader(self, downloader):
658 """Sets the downloader for this IE."""
659 self._downloader = downloader
660
661 def _real_initialize(self):
662 """Real initialization process. Redefine in subclasses."""
663 pass
664
665 def _real_extract(self, url):
666 """Real extraction process. Redefine in subclasses."""
667 pass
668
669 @classmethod
670 def ie_key(cls):
671 """A string for getting the InfoExtractor with get_info_extractor"""
672 return cls.__name__[:-2]
673
674 @property
675 def IE_NAME(self):
676 return compat_str(type(self).__name__[:-2])
677
678 @staticmethod
679 def __can_accept_status_code(err, expected_status):
680 assert isinstance(err, compat_urllib_error.HTTPError)
681 if expected_status is None:
682 return False
683 elif callable(expected_status):
684 return expected_status(err.code) is True
685 else:
686 return err.code in variadic(expected_status)
687
688 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
689 """
690 Return the response handle.
691
692 See _download_webpage docstring for arguments specification.
693 """
694 if not self._downloader._first_webpage_request:
695 sleep_interval = self.get_param('sleep_interval_requests') or 0
696 if sleep_interval > 0:
697 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
698 time.sleep(sleep_interval)
699 else:
700 self._downloader._first_webpage_request = False
701
702 if note is None:
703 self.report_download_webpage(video_id)
704 elif note is not False:
705 if video_id is None:
706 self.to_screen('%s' % (note,))
707 else:
708 self.to_screen('%s: %s' % (video_id, note))
709
710 # Some sites check X-Forwarded-For HTTP header in order to figure out
711 # the origin of the client behind proxy. This allows bypassing geo
712 # restriction by faking this header's value to IP that belongs to some
713 # geo unrestricted country. We will do so once we encounter any
714 # geo restriction error.
715 if self._x_forwarded_for_ip:
716 if 'X-Forwarded-For' not in headers:
717 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
718
719 if isinstance(url_or_request, compat_urllib_request.Request):
720 url_or_request = update_Request(
721 url_or_request, data=data, headers=headers, query=query)
722 else:
723 if query:
724 url_or_request = update_url_query(url_or_request, query)
725 if data is not None or headers:
726 url_or_request = sanitized_Request(url_or_request, data, headers)
727 try:
728 return self._downloader.urlopen(url_or_request)
729 except network_exceptions as err:
730 if isinstance(err, compat_urllib_error.HTTPError):
731 if self.__can_accept_status_code(err, expected_status):
732 # Retain reference to error to prevent file object from
733 # being closed before it can be read. Works around the
734 # effects of <https://bugs.python.org/issue15002>
735 # introduced in Python 3.4.1.
736 err.fp._error = err
737 return err.fp
738
739 if errnote is False:
740 return False
741 if errnote is None:
742 errnote = 'Unable to download webpage'
743
744 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
745 if fatal:
746 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
747 else:
748 self.report_warning(errmsg)
749 return False
750
751 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
752 """
753 Return a tuple (page content as string, URL handle).
754
755 See _download_webpage docstring for arguments specification.
756 """
757 # Strip hashes from the URL (#1038)
758 if isinstance(url_or_request, (compat_str, str)):
759 url_or_request = url_or_request.partition('#')[0]
760
761 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
762 if urlh is False:
763 assert not fatal
764 return False
765 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
766 return (content, urlh)
767
768 @staticmethod
769 def _guess_encoding_from_content(content_type, webpage_bytes):
770 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
771 if m:
772 encoding = m.group(1)
773 else:
774 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
775 webpage_bytes[:1024])
776 if m:
777 encoding = m.group(1).decode('ascii')
778 elif webpage_bytes.startswith(b'\xff\xfe'):
779 encoding = 'utf-16'
780 else:
781 encoding = 'utf-8'
782
783 return encoding
784
785 def __check_blocked(self, content):
786 first_block = content[:512]
787 if ('<title>Access to this site is blocked</title>' in content
788 and 'Websense' in first_block):
789 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
790 blocked_iframe = self._html_search_regex(
791 r'<iframe src="([^"]+)"', content,
792 'Websense information URL', default=None)
793 if blocked_iframe:
794 msg += ' Visit %s for more details' % blocked_iframe
795 raise ExtractorError(msg, expected=True)
796 if '<title>The URL you requested has been blocked</title>' in first_block:
797 msg = (
798 'Access to this webpage has been blocked by Indian censorship. '
799 'Use a VPN or proxy server (with --proxy) to route around it.')
800 block_msg = self._html_search_regex(
801 r'</h1><p>(.*?)</p>',
802 content, 'block message', default=None)
803 if block_msg:
804 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
805 raise ExtractorError(msg, expected=True)
806 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
807 and 'blocklist.rkn.gov.ru' in content):
808 raise ExtractorError(
809 'Access to this webpage has been blocked by decision of the Russian government. '
810 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
811 expected=True)
812
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the response body of `urlh` into a string.

        Honors --dump-pages (base64 dump to screen) and --write-pages (save
        the raw body to a .dump file).  `prefix` bytes, if given, are
        prepended before decoding; `encoding` overrides charset detection.
        Raises ExtractorError (via __check_blocked) on known block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                # Keep truncated names unique by appending a hash of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name (bad header/meta charset) - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
850
851 def _download_webpage(
852 self, url_or_request, video_id, note=None, errnote=None,
853 fatal=True, tries=1, timeout=5, encoding=None, data=None,
854 headers={}, query={}, expected_status=None):
855 """
856 Return the data of the page as a string.
857
858 Arguments:
859 url_or_request -- plain text URL as a string or
860 a compat_urllib_request.Requestobject
861 video_id -- Video/playlist/item identifier (string)
862
863 Keyword arguments:
864 note -- note printed before downloading (string)
865 errnote -- note printed in case of an error (string)
866 fatal -- flag denoting whether error should be considered fatal,
867 i.e. whether it should cause ExtractionError to be raised,
868 otherwise a warning will be reported and extraction continued
869 tries -- number of tries
870 timeout -- sleep interval between tries
871 encoding -- encoding for a page content decoding, guessed automatically
872 when not explicitly specified
873 data -- POST data (bytes)
874 headers -- HTTP headers (dict)
875 query -- URL query (dict)
876 expected_status -- allows to accept failed HTTP requests (non 2xx
877 status code) by explicitly specifying a set of accepted status
878 codes. Can be any of the following entities:
879 - an integer type specifying an exact failed status code to
880 accept
881 - a list or a tuple of integer types specifying a list of
882 failed status codes to accept
883 - a callable accepting an actual failed status code and
884 returning True if it should be accepted
885 Note that this argument does not affect success status codes (2xx)
886 which are always accepted.
887 """
888
889 success = False
890 try_count = 0
891 while success is False:
892 try:
893 res = self._download_webpage_handle(
894 url_or_request, video_id, note, errnote, fatal,
895 encoding=encoding, data=data, headers=headers, query=query,
896 expected_status=expected_status)
897 success = True
898 except compat_http_client.IncompleteRead as e:
899 try_count += 1
900 if try_count >= tries:
901 raise e
902 self._sleep(timeout, video_id)
903 if res is False:
904 return res
905 else:
906 content, _ = res
907 return content
908
909 def _download_xml_handle(
910 self, url_or_request, video_id, note='Downloading XML',
911 errnote='Unable to download XML', transform_source=None,
912 fatal=True, encoding=None, data=None, headers={}, query={},
913 expected_status=None):
914 """
915 Return a tuple (xml as an compat_etree_Element, URL handle).
916
917 See _download_webpage docstring for arguments specification.
918 """
919 res = self._download_webpage_handle(
920 url_or_request, video_id, note, errnote, fatal=fatal,
921 encoding=encoding, data=data, headers=headers, query=query,
922 expected_status=expected_status)
923 if res is False:
924 return res
925 xml_string, urlh = res
926 return self._parse_xml(
927 xml_string, video_id, transform_source=transform_source,
928 fatal=fatal), urlh
929
930 def _download_xml(
931 self, url_or_request, video_id,
932 note='Downloading XML', errnote='Unable to download XML',
933 transform_source=None, fatal=True, encoding=None,
934 data=None, headers={}, query={}, expected_status=None):
935 """
936 Return the xml as an compat_etree_Element.
937
938 See _download_webpage docstring for arguments specification.
939 """
940 res = self._download_xml_handle(
941 url_or_request, video_id, note=note, errnote=errnote,
942 transform_source=transform_source, fatal=fatal, encoding=encoding,
943 data=data, headers=headers, query=query,
944 expected_status=expected_status)
945 return res if res is False else res[0]
946
947 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
948 if transform_source:
949 xml_string = transform_source(xml_string)
950 try:
951 return compat_etree_fromstring(xml_string.encode('utf-8'))
952 except compat_xml_parse_error as ve:
953 errmsg = '%s: Failed to parse XML ' % video_id
954 if fatal:
955 raise ExtractorError(errmsg, cause=ve)
956 else:
957 self.report_warning(errmsg + str(ve))
958
959 def _download_json_handle(
960 self, url_or_request, video_id, note='Downloading JSON metadata',
961 errnote='Unable to download JSON metadata', transform_source=None,
962 fatal=True, encoding=None, data=None, headers={}, query={},
963 expected_status=None):
964 """
965 Return a tuple (JSON object, URL handle).
966
967 See _download_webpage docstring for arguments specification.
968 """
969 res = self._download_webpage_handle(
970 url_or_request, video_id, note, errnote, fatal=fatal,
971 encoding=encoding, data=data, headers=headers, query=query,
972 expected_status=expected_status)
973 if res is False:
974 return res
975 json_string, urlh = res
976 return self._parse_json(
977 json_string, video_id, transform_source=transform_source,
978 fatal=fatal), urlh
979
980 def _download_json(
981 self, url_or_request, video_id, note='Downloading JSON metadata',
982 errnote='Unable to download JSON metadata', transform_source=None,
983 fatal=True, encoding=None, data=None, headers={}, query={},
984 expected_status=None):
985 """
986 Return the JSON object as a dict.
987
988 See _download_webpage docstring for arguments specification.
989 """
990 res = self._download_json_handle(
991 url_or_request, video_id, note=note, errnote=errnote,
992 transform_source=transform_source, fatal=fatal, encoding=encoding,
993 data=data, headers=headers, query=query,
994 expected_status=expected_status)
995 return res if res is False else res[0]
996
997 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
998 if transform_source:
999 json_string = transform_source(json_string)
1000 try:
1001 return json.loads(json_string)
1002 except ValueError as ve:
1003 errmsg = '%s: Failed to parse JSON ' % video_id
1004 if fatal:
1005 raise ExtractorError(errmsg, cause=ve)
1006 else:
1007 self.report_warning(errmsg + str(ve))
1008
1009 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1010 return self._parse_json(
1011 data[data.find('{'):data.rfind('}') + 1],
1012 video_id, transform_source, fatal)
1013
1014 def _download_socket_json_handle(
1015 self, url_or_request, video_id, note='Polling socket',
1016 errnote='Unable to poll socket', transform_source=None,
1017 fatal=True, encoding=None, data=None, headers={}, query={},
1018 expected_status=None):
1019 """
1020 Return a tuple (JSON object, URL handle).
1021
1022 See _download_webpage docstring for arguments specification.
1023 """
1024 res = self._download_webpage_handle(
1025 url_or_request, video_id, note, errnote, fatal=fatal,
1026 encoding=encoding, data=data, headers=headers, query=query,
1027 expected_status=expected_status)
1028 if res is False:
1029 return res
1030 webpage, urlh = res
1031 return self._parse_socket_response_as_json(
1032 webpage, video_id, transform_source=transform_source,
1033 fatal=fatal), urlh
1034
1035 def _download_socket_json(
1036 self, url_or_request, video_id, note='Polling socket',
1037 errnote='Unable to poll socket', transform_source=None,
1038 fatal=True, encoding=None, data=None, headers={}, query={},
1039 expected_status=None):
1040 """
1041 Return the JSON object as a dict.
1042
1043 See _download_webpage docstring for arguments specification.
1044 """
1045 res = self._download_socket_json_handle(
1046 url_or_request, video_id, note=note, errnote=errnote,
1047 transform_source=transform_source, fatal=fatal, encoding=encoding,
1048 data=data, headers=headers, query=query,
1049 expected_status=expected_status)
1050 return res if res is False else res[0]
1051
1052 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1053 idstr = format_field(video_id, template='%s: ')
1054 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1055 if only_once:
1056 if f'WARNING: {msg}' in self._printed_messages:
1057 return
1058 self._printed_messages.add(f'WARNING: {msg}')
1059 self._downloader.report_warning(msg, *args, **kwargs)
1060
1061 def to_screen(self, msg, *args, **kwargs):
1062 """Print msg to screen, prefixing it with '[ie_name]'"""
1063 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1064
1065 def write_debug(self, msg, *args, **kwargs):
1066 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1067
1068 def get_param(self, name, default=None, *args, **kwargs):
1069 if self._downloader:
1070 return self._downloader.params.get(name, default, *args, **kwargs)
1071 return default
1072
    def report_drm(self, video_id, partial=False):
        """Signal that the video is DRM protected, via raise_no_formats
        (which only warns when metadata-only extraction is allowed).

        Note: `partial` is accepted but not used in this implementation.
        """
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1075
1076 def report_extraction(self, id_or_name):
1077 """Report information extraction."""
1078 self.to_screen('%s: Extracting information' % id_or_name)
1079
1080 def report_download_webpage(self, video_id):
1081 """Report webpage download."""
1082 self.to_screen('%s: Downloading webpage' % video_id)
1083
    def report_age_confirmation(self):
        """Report an attempt to confirm the user's age (age-gated content)."""
        self.to_screen('Confirming age')
1087
    def report_login(self):
        """Report an attempt to log in to the site."""
        self.to_screen('Logging in')
1091
1092 def raise_login_required(
1093 self, msg='This video is only available for registered users',
1094 metadata_available=False, method='any'):
1095 if metadata_available and (
1096 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1097 self.report_warning(msg)
1098 if method is not None:
1099 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1100 raise ExtractorError(msg, expected=True)
1101
1102 def raise_geo_restricted(
1103 self, msg='This video is not available from your location due to geo restriction',
1104 countries=None, metadata_available=False):
1105 if metadata_available and (
1106 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1107 self.report_warning(msg)
1108 else:
1109 raise GeoRestrictedError(msg, countries=countries)
1110
1111 def raise_no_formats(self, msg, expected=False, video_id=None):
1112 if expected and (
1113 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1114 self.report_warning(msg, video_id)
1115 elif isinstance(msg, ExtractorError):
1116 raise msg
1117 else:
1118 raise ExtractorError(msg, expected=expected, video_id=video_id)
1119
1120 # Methods for following #608
1121 @staticmethod
1122 def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1123 """Returns a URL that points to a page that should be processed"""
1124 # TODO: ie should be the class used for getting the info
1125 video_info = {'_type': 'url',
1126 'url': url,
1127 'ie_key': ie}
1128 video_info.update(kwargs)
1129 if video_id is not None:
1130 video_info['id'] = video_id
1131 if video_title is not None:
1132 video_info['title'] = video_title
1133 return video_info
1134
1135 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1136 urls = orderedSet(
1137 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1138 for m in matches)
1139 return self.playlist_result(
1140 urls, playlist_id=playlist_id, playlist_title=playlist_title)
1141
1142 @staticmethod
1143 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1144 """Returns a playlist"""
1145 video_info = {'_type': 'playlist',
1146 'entries': entries}
1147 video_info.update(kwargs)
1148 if playlist_id:
1149 video_info['id'] = playlist_id
1150 if playlist_title:
1151 video_info['title'] = playlist_title
1152 if playlist_description is not None:
1153 video_info['description'] = playlist_description
1154 return video_info
1155
1156 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1157 """
1158 Perform a regex search on the given string, using a single or a list of
1159 patterns returning the first matching group.
1160 In case of failure return a default value or raise a WARNING or a
1161 RegexNotFoundError, depending on fatal, specifying the field name.
1162 """
1163 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1164 mobj = re.search(pattern, string, flags)
1165 else:
1166 for p in pattern:
1167 mobj = re.search(p, string, flags)
1168 if mobj:
1169 break
1170
1171 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1172
1173 if mobj:
1174 if group is None:
1175 # return the first matching group
1176 return next(g for g in mobj.groups() if g is not None)
1177 elif isinstance(group, (list, tuple)):
1178 return tuple(mobj.group(g) for g in group)
1179 else:
1180 return mobj.group(group)
1181 elif default is not NO_DEFAULT:
1182 return default
1183 elif fatal:
1184 raise RegexNotFoundError('Unable to extract %s' % _name)
1185 else:
1186 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1187 return None
1188
1189 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1190 """
1191 Like _search_regex, but strips HTML tags and unescapes entities.
1192 """
1193 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1194 if res:
1195 return clean_html(res).strip()
1196 else:
1197 return res
1198
1199 def _get_netrc_login_info(self, netrc_machine=None):
1200 username = None
1201 password = None
1202 netrc_machine = netrc_machine or self._NETRC_MACHINE
1203
1204 if self.get_param('usenetrc', False):
1205 try:
1206 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1207 if os.path.isdir(netrc_file):
1208 netrc_file = os.path.join(netrc_file, '.netrc')
1209 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1210 if info is not None:
1211 username = info[0]
1212 password = info[2]
1213 else:
1214 raise netrc.NetrcParseError(
1215 'No authenticators for %s' % netrc_machine)
1216 except (IOError, netrc.NetrcParseError) as err:
1217 self.report_warning(
1218 'parsing .netrc: %s' % error_to_compat_str(err))
1219
1220 return username, password
1221
1222 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1223 """
1224 Get the login info as (username, password)
1225 First look for the manually specified credentials using username_option
1226 and password_option as keys in params dictionary. If no such credentials
1227 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1228 value.
1229 If there's no info available, return (None, None)
1230 """
1231
1232 # Attempt to use provided username and password or .netrc data
1233 username = self.get_param(username_option)
1234 if username is not None:
1235 password = self.get_param(password_option)
1236 else:
1237 username, password = self._get_netrc_login_info(netrc_machine)
1238
1239 return username, password
1240
1241 def _get_tfa_info(self, note='two-factor verification code'):
1242 """
1243 Get the two-factor authentication info
1244 TODO - asking the user will be required for sms/phone verify
1245 currently just uses the command line option
1246 If there's no info available, return None
1247 """
1248
1249 tfa = self.get_param('twofactor')
1250 if tfa is not None:
1251 return tfa
1252
1253 return compat_getpass('Type %s and press [Return]: ' % note)
1254
1255 # Helper functions for extracting OpenGraph info
1256 @staticmethod
1257 def _og_regexes(prop):
1258 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1259 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1260 % {'prop': re.escape(prop)})
1261 template = r'<meta[^>]+?%s[^>]+?%s'
1262 return [
1263 template % (property_re, content_re),
1264 template % (content_re, property_re),
1265 ]
1266
    @staticmethod
    def _meta_regex(prop):
        """Build a verbose (re.X) regex matching a <meta> tag whose
        itemprop/name/property/id/http-equiv attribute equals `prop`;
        the value is captured in the named group 'content'."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1272
1273 def _og_search_property(self, prop, html, name=None, **kargs):
1274 prop = variadic(prop)
1275 if name is None:
1276 name = 'OpenGraph %s' % prop[0]
1277 og_regexes = []
1278 for p in prop:
1279 og_regexes.extend(self._og_regexes(p))
1280 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1281 if escaped is None:
1282 return None
1283 return unescapeHTML(escaped)
1284
    def _og_search_thumbnail(self, html, **kargs):
        """Extract the og:image thumbnail URL (non-fatal; None when absent)."""
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1287
    def _og_search_description(self, html, **kargs):
        """Extract the og:description text (non-fatal; None when absent)."""
        return self._og_search_property('description', html, fatal=False, **kargs)
1290
    def _og_search_title(self, html, **kargs):
        """Extract the og:title text (fatal by default, unlike the other og helpers)."""
        return self._og_search_property('title', html, **kargs)
1293
1294 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1295 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1296 if secure:
1297 regexes = self._og_regexes('video:secure_url') + regexes
1298 return self._html_search_regex(regexes, html, name, **kargs)
1299
    def _og_search_url(self, html, **kargs):
        """Extract the canonical page URL from the og:url property."""
        return self._og_search_property('url', html, **kargs)
1302
1303 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1304 name = variadic(name)
1305 if display_name is None:
1306 display_name = name[0]
1307 return self._html_search_regex(
1308 [self._meta_regex(n) for n in name],
1309 html, display_name, fatal=fatal, group='content', **kwargs)
1310
    def _dc_search_uploader(self, html):
        """Extract the uploader from a Dublin Core <meta name="dc.creator"> tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')
1313
1314 def _rta_search(self, html):
1315 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1316 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1317 r' content="RTA-5042-1996-1400-1577-RTA"',
1318 html):
1319 return 18
1320 return 0
1321
1322 def _media_rating_search(self, html):
1323 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1324 rating = self._html_search_meta('rating', html)
1325
1326 if not rating:
1327 return None
1328
1329 RATING_TABLE = {
1330 'safe for kids': 0,
1331 'general': 8,
1332 '14 years': 14,
1333 'mature': 17,
1334 'restricted': 19,
1335 }
1336 return RATING_TABLE.get(rating.lower())
1337
1338 def _family_friendly_search(self, html):
1339 # See http://schema.org/VideoObject
1340 family_friendly = self._html_search_meta(
1341 'isFamilyFriendly', html, default=None)
1342
1343 if not family_friendly:
1344 return None
1345
1346 RATING_TABLE = {
1347 '1': 0,
1348 'true': 0,
1349 '0': 18,
1350 'false': 18,
1351 }
1352 return RATING_TABLE.get(family_friendly.lower())
1353
    def _twitter_search_player(self, html):
        """Extract the player URL from a Twitter card <meta name="twitter:player"> tag."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
1357
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Find all JSON-LD script blocks in `html` and merge them into an
        info dict via _json_ld.  Supports `default` and `fatal` keyword
        arguments with _search_regex-like semantics."""
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A single script block may hold one object or a list of them
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
1386
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert extracted JSON-LD data (string, dict, or list of dicts)
        into an info dict.  When `expected_type` is given, only entries of
        that @type contribute; returns {} when nothing usable is found."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interaction @type (last URL path component) to the
        # yt-dlp *_count field name
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested typed object
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Populate info's *_count fields from InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # First counter of each kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from hasPart Clip entries, filling missing
            # start/end times from the neighbouring chapters
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter's end defaults to the video duration
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url_or_none(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must declare @context to count as JSON-LD
                if at_top_level and '@context' not in e:
                    continue
                # Pure @graph wrappers: recurse into the contained nodes
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # Other types may still embed a VideoObject under 'video'
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return dict((k, v) for k, v in info.items() if v is not None)
1545
1546 def _search_nextjs_data(self, webpage, video_id, **kw):
1547 return self._parse_json(
1548 self._search_regex(
1549 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1550 webpage, 'next.js data', **kw),
1551 video_id, **kw)
1552
    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
        ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
        # not all website do this, but it can be changed
        # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
        rectx = re.escape(context_name)
        # Match `window.__NUXT__=(function(a,b,...){return {...}}(v1,v2,...))`,
        # capturing the returned object literal plus the IIFE's parameter
        # names and argument values
        js, arg_keys, arg_vals = self._search_regex(
            (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
             r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
            webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])

        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))

        # JS `undefined`/`void 0` have no JSON equivalent - substitute null
        for key, val in args.items():
            if val in ('undefined', 'void 0'):
                args[key] = 'null'

        return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1570
1571 @staticmethod
1572 def _hidden_inputs(html):
1573 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1574 hidden_inputs = {}
1575 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1576 attrs = extract_attributes(input)
1577 if not input:
1578 continue
1579 if attrs.get('type') not in ('hidden', 'submit'):
1580 continue
1581 name = attrs.get('name') or attrs.get('id')
1582 value = attrs.get('value')
1583 if name and value is not None:
1584 hidden_inputs[name] = value
1585 return hidden_inputs
1586
1587 def _form_hidden_inputs(self, form_id, html):
1588 form = self._search_regex(
1589 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1590 html, '%s form' % form_id, group='form')
1591 return self._hidden_inputs(form)
1592
    class FormatSort:
        """Computes per-format sort keys for _sort_formats.

        The effective order is built from forced defaults, user preferences
        (params['format_sort']), extractor preferences and finally `default`.
        Each specifier matches `regex`: an optional '+' reverses the field,
        ':limit' / '~limit' attach a limit (with '~' preferring values closest
        to the limit).
        """
        # One field-preference specifier: [+]field[~:limit]
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # Order approximating youtube-dl's sorting, selectable for compatibility
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration; missing keys are filled lazily by
        # _get_field_setting. 'ordered' fields rank values by their position
        # in 'order' (optionally as regexes); 'combined'/'multiple' fields
        # expand to several underlying fields; 'alias' redirects to another.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     # used instead of 'order' when prefer_free_formats is set
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           # 1 when the format has any stream at all, else 0
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    # the smaller non-zero dimension, or 0 when both are missing
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
        }

        def __init__(self, ie, field_preference):
            """Build the sort order from downloader params and the extractor's field_preference."""
            self._order = []
            self.ydl = ie._downloader
            self.evaluate_params(self.ydl.params, field_preference)
            if ie.get_param('verbose'):
                self.print_verbose_info(self.ydl.write_debug)

        def _get_field_setting(self, field, key):
            """Return setting *key* of *field*, lazily computing a default.

            Unknown fields produce a deprecation warning and get an empty
            settings entry, so arbitrary format keys still sort.
            """
            if field not in self.settings:
                if key in ('forced', 'priority'):
                    return False
                self.ydl.deprecation_warning(
                    f'Using arbitrary fields ({field}) for format sorting is deprecated '
                    'and may be removed in a future version')
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Normalize *value* for *field* according to its 'convert' setting.

            For 'ordered' fields returns a rank derived from the position in
            the (possibly regex-based) order list; higher means earlier in
            the list. None is kept as None unless convertNone is set.
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric strings become floats; anything else
                # demotes the whole field to string comparison from now on
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Populate self._order and per-field reverse/limit data.

            Precedence: forced defaults, then (unless format_sort_force)
            priority defaults, then user sort, then extractor sort, then the
            remaining defaults. The first occurrence of a field wins.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Record one field (first occurrence only) with its modifiers
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if alias not in ('format_id', 'preference', 'language_preference'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # Expand combined fields; a single limit (or ':'-separated
                # limits) is distributed over the expanded fields
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Write the user/extractor/effective sort orders to the debug log."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Map one raw field value to a comparable tuple for sorting.

            The leading element groups values (missing < strings < numbers);
            the remaining elements order within the group, honoring the
            field's reverse/closest/limit modifiers.
            """
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Compute the sort tuple of *field* for one format dict."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the complete sort key for *format*, filling in derived fields first."""
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1877
1878 def _sort_formats(self, formats, field_preference=[]):
1879 if not formats:
1880 return
1881 format_sort = self.FormatSort(self, field_preference)
1882 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1883
1884 def _check_formats(self, formats, video_id):
1885 if formats:
1886 formats[:] = filter(
1887 lambda f: self._is_valid_url(
1888 f['url'], video_id,
1889 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1890 formats)
1891
1892 @staticmethod
1893 def _remove_duplicate_formats(formats):
1894 format_urls = set()
1895 unique_formats = []
1896 for f in formats:
1897 if f['url'] not in format_urls:
1898 format_urls.add(f['url'])
1899 unique_formats.append(f)
1900 formats[:] = unique_formats
1901
1902 def _is_valid_url(self, url, video_id, item='video', headers={}):
1903 url = self._proto_relative_url(url, scheme='http:')
1904 # For now assume non HTTP(S) URLs always valid
1905 if not (url.startswith('http://') or url.startswith('https://')):
1906 return True
1907 try:
1908 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1909 return True
1910 except ExtractorError as e:
1911 self.to_screen(
1912 '%s: %s URL is invalid, skipping: %s'
1913 % (video_id, item, error_to_compat_str(e.cause)))
1914 return False
1915
1916 def http_scheme(self):
1917 """ Either "http:" or "https:", depending on the user's preferences """
1918 return (
1919 'http:'
1920 if self.get_param('prefer_insecure', False)
1921 else 'https:')
1922
1923 def _proto_relative_url(self, url, scheme=None):
1924 if url is None:
1925 return url
1926 if url.startswith('//'):
1927 if scheme is None:
1928 scheme = self.http_scheme()
1929 return scheme + url
1930 else:
1931 return url
1932
1933 def _sleep(self, timeout, video_id, msg_template=None):
1934 if msg_template is None:
1935 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1936 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1937 self.to_screen(msg)
1938 time.sleep(timeout)
1939
1940 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1941 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1942 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1943 manifest = self._download_xml(
1944 manifest_url, video_id, 'Downloading f4m manifest',
1945 'Unable to download f4m manifest',
1946 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1947 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1948 transform_source=transform_source,
1949 fatal=fatal, data=data, headers=headers, query=query)
1950
1951 if manifest is False:
1952 return []
1953
1954 return self._parse_f4m_formats(
1955 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1956 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1957
1958 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1959 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1960 fatal=True, m3u8_id=None):
1961 if not isinstance(manifest, compat_etree_Element) and not fatal:
1962 return []
1963
1964 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1965 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1966 if akamai_pv is not None and ';' in akamai_pv.text:
1967 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1968 if playerVerificationChallenge.strip() != '':
1969 return []
1970
1971 formats = []
1972 manifest_version = '1.0'
1973 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1974 if not media_nodes:
1975 manifest_version = '2.0'
1976 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1977 # Remove unsupported DRM protected media from final formats
1978 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1979 media_nodes = remove_encrypted_media(media_nodes)
1980 if not media_nodes:
1981 return formats
1982
1983 manifest_base_url = get_base_url(manifest)
1984
1985 bootstrap_info = xpath_element(
1986 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1987 'bootstrap info', default=None)
1988
1989 vcodec = None
1990 mime_type = xpath_text(
1991 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1992 'base URL', default=None)
1993 if mime_type and mime_type.startswith('audio/'):
1994 vcodec = 'none'
1995
1996 for i, media_el in enumerate(media_nodes):
1997 tbr = int_or_none(media_el.attrib.get('bitrate'))
1998 width = int_or_none(media_el.attrib.get('width'))
1999 height = int_or_none(media_el.attrib.get('height'))
2000 format_id = join_nonempty(f4m_id, tbr or i)
2001 # If <bootstrapInfo> is present, the specified f4m is a
2002 # stream-level manifest, and only set-level manifests may refer to
2003 # external resources. See section 11.4 and section 4 of F4M spec
2004 if bootstrap_info is None:
2005 media_url = None
2006 # @href is introduced in 2.0, see section 11.6 of F4M spec
2007 if manifest_version == '2.0':
2008 media_url = media_el.attrib.get('href')
2009 if media_url is None:
2010 media_url = media_el.attrib.get('url')
2011 if not media_url:
2012 continue
2013 manifest_url = (
2014 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2015 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2016 # If media_url is itself a f4m manifest do the recursive extraction
2017 # since bitrates in parent manifest (this one) and media_url manifest
2018 # may differ leading to inability to resolve the format by requested
2019 # bitrate in f4m downloader
2020 ext = determine_ext(manifest_url)
2021 if ext == 'f4m':
2022 f4m_formats = self._extract_f4m_formats(
2023 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2024 transform_source=transform_source, fatal=fatal)
2025 # Sometimes stream-level manifest contains single media entry that
2026 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2027 # At the same time parent's media entry in set-level manifest may
2028 # contain it. We will copy it from parent in such cases.
2029 if len(f4m_formats) == 1:
2030 f = f4m_formats[0]
2031 f.update({
2032 'tbr': f.get('tbr') or tbr,
2033 'width': f.get('width') or width,
2034 'height': f.get('height') or height,
2035 'format_id': f.get('format_id') if not tbr else format_id,
2036 'vcodec': vcodec,
2037 })
2038 formats.extend(f4m_formats)
2039 continue
2040 elif ext == 'm3u8':
2041 formats.extend(self._extract_m3u8_formats(
2042 manifest_url, video_id, 'mp4', preference=preference,
2043 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2044 continue
2045 formats.append({
2046 'format_id': format_id,
2047 'url': manifest_url,
2048 'manifest_url': manifest_url,
2049 'ext': 'flv' if bootstrap_info is not None else None,
2050 'protocol': 'f4m',
2051 'tbr': tbr,
2052 'width': width,
2053 'height': height,
2054 'vcodec': vcodec,
2055 'preference': preference,
2056 'quality': quality,
2057 })
2058 return formats
2059
2060 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2061 return {
2062 'format_id': join_nonempty(m3u8_id, 'meta'),
2063 'url': m3u8_url,
2064 'ext': ext,
2065 'protocol': 'm3u8',
2066 'preference': preference - 100 if preference else -100,
2067 'quality': quality,
2068 'resolution': 'multiple',
2069 'format_note': 'Quality selection URL',
2070 }
2071
2072 def _report_ignoring_subs(self, name):
2073 self.report_warning(bug_reports_message(
2074 f'Ignoring subtitle tracks found in the {name} manifest; '
2075 'if any subtitle tracks are missing,'
2076 ), only_once=True)
2077
2078 def _extract_m3u8_formats(self, *args, **kwargs):
2079 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2080 if subs:
2081 self._report_ignoring_subs('HLS')
2082 return fmts
2083
2084 def _extract_m3u8_formats_and_subtitles(
2085 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2086 preference=None, quality=None, m3u8_id=None, note=None,
2087 errnote=None, fatal=True, live=False, data=None, headers={},
2088 query={}):
2089
2090 res = self._download_webpage_handle(
2091 m3u8_url, video_id,
2092 note='Downloading m3u8 information' if note is None else note,
2093 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2094 fatal=fatal, data=data, headers=headers, query=query)
2095
2096 if res is False:
2097 return [], {}
2098
2099 m3u8_doc, urlh = res
2100 m3u8_url = urlh.geturl()
2101
2102 return self._parse_m3u8_formats_and_subtitles(
2103 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2104 preference=preference, quality=quality, m3u8_id=m3u8_id,
2105 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2106 headers=headers, query=query, video_id=video_id)
2107
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse an HLS playlist document into (formats, subtitles).

        Media playlists are returned as a single format; master playlists are
        expanded into one format per variant stream and per audio/video
        rendition, with subtitle renditions collected into the subtitles dict
        (language -> list of {url, ext, ...}).
        """
        formats, subtitles = [], {}

        # Detect DRM schemes that the downloader cannot handle
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            # Resolve a possibly-relative playlist reference against m3u8_url
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One format index per discontinuity-separated section of the
                # media playlist (downloading it if only the URL is known)
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Splitting disabled: a single, unindexed format per playlist
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handle one #EXT-X-MEDIA line: register the rendition in its
            # group and emit a subtitle entry or an audio/video format
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment line right after EXT-X-STREAM-INF is the
                # variant stream's playlist URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Mirror the HLS format as a plain-HTTP download
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2328
2329 def _extract_m3u8_vod_duration(
2330 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2331
2332 m3u8_vod = self._download_webpage(
2333 m3u8_vod_url, video_id,
2334 note='Downloading m3u8 VOD manifest' if note is None else note,
2335 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2336 fatal=False, data=data, headers=headers, query=query)
2337
2338 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2339
2340 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2341 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2342 return None
2343
2344 return int(sum(
2345 float(line[len('#EXTINF:'):].split(',')[0])
2346 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2347
2348 @staticmethod
2349 def _xpath_ns(path, namespace=None):
2350 if not namespace:
2351 return path
2352 out = []
2353 for c in path.split('/'):
2354 if not c or c == '.':
2355 out.append(c)
2356 else:
2357 out.append('{%s}%s' % (namespace, c))
2358 return '/'.join(out)
2359
2360 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2361 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2362
2363 if smil is False:
2364 assert not fatal
2365 return [], {}
2366
2367 namespace = self._parse_smil_namespace(smil)
2368
2369 fmts = self._parse_smil_formats(
2370 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2371 subs = self._parse_smil_subtitles(
2372 smil, namespace=namespace)
2373
2374 return fmts, subs
2375
2376 def _extract_smil_formats(self, *args, **kwargs):
2377 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2378 if subs:
2379 self._report_ignoring_subs('SMIL')
2380 return fmts
2381
2382 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2383 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2384 if smil is False:
2385 return {}
2386 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2387
2388 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2389 return self._download_xml(
2390 smil_url, video_id, 'Downloading SMIL file',
2391 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2392
2393 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2394 namespace = self._parse_smil_namespace(smil)
2395
2396 formats = self._parse_smil_formats(
2397 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2398 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2399
2400 video_id = os.path.splitext(url_basename(smil_url))[0]
2401 title = None
2402 description = None
2403 upload_date = None
2404 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2405 name = meta.attrib.get('name')
2406 content = meta.attrib.get('content')
2407 if not name or not content:
2408 continue
2409 if not title and name == 'title':
2410 title = content
2411 elif not description and name in ('description', 'abstract'):
2412 description = content
2413 elif not upload_date and name == 'date':
2414 upload_date = unified_strdate(content)
2415
2416 thumbnails = [{
2417 'id': image.get('type'),
2418 'url': image.get('src'),
2419 'width': int_or_none(image.get('width')),
2420 'height': int_or_none(image.get('height')),
2421 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2422
2423 return {
2424 'id': video_id,
2425 'title': title or video_id,
2426 'description': description,
2427 'upload_date': upload_date,
2428 'thumbnails': thumbnails,
2429 'formats': formats,
2430 'subtitles': subtitles,
2431 }
2432
2433 def _parse_smil_namespace(self, smil):
2434 return self._search_regex(
2435 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2436
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from a parsed SMIL document.

        Recognizes RTMP streams, HLS (m3u8), HDS (f4m), DASH (mpd) and ISM
        manifests as well as plain HTTP <video>/<audio> sources, plus
        <imagestream> storyboard tracks.
        """
        # A <meta base=...> (or httpBase) in <head> overrides the manifest
        # URL as the base for resolving relative media URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # per-protocol counters, used as fallback format_id suffixes when no
        # bitrate is declared
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        srcs = set()  # dedupe media entries by src URL
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            # both hyphenated and camelCase attribute spellings occur in the wild
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                # optional caller hook to rewrite the RTMP URL/play path
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # a single-rendition playlist carries no variant info of its
                # own, so fill in what the SMIL medium declared
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # default Adobe HDS query parameters
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # NOTE(review): validity is checked against the raw src, not
                # the resolved src_url — confirm whether that is intentional
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        # <imagestream> elements describe storyboard (thumbnail sheet) tracks
        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': 'imagestream-%d' % (imgs_count),
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        return formats
2550
2551 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2552 urls = []
2553 subtitles = {}
2554 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2555 src = textstream.get('src')
2556 if not src or src in urls:
2557 continue
2558 urls.append(src)
2559 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2560 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2561 subtitles.setdefault(lang, []).append({
2562 'url': src,
2563 'ext': ext,
2564 })
2565 return subtitles
2566
2567 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2568 xspf = self._download_xml(
2569 xspf_url, playlist_id, 'Downloading xpsf playlist',
2570 'Unable to download xspf manifest', fatal=fatal)
2571 if xspf is False:
2572 return []
2573 return self._parse_xspf(
2574 xspf, playlist_id, xspf_url=xspf_url,
2575 xspf_base_url=base_url(xspf_url))
2576
2577 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2578 NS_MAP = {
2579 'xspf': 'http://xspf.org/ns/0/',
2580 's1': 'http://static.streamone.nl/player/ns/0',
2581 }
2582
2583 entries = []
2584 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2585 title = xpath_text(
2586 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2587 description = xpath_text(
2588 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2589 thumbnail = xpath_text(
2590 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2591 duration = float_or_none(
2592 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2593
2594 formats = []
2595 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2596 format_url = urljoin(xspf_base_url, location.text)
2597 if not format_url:
2598 continue
2599 formats.append({
2600 'url': format_url,
2601 'manifest_url': xspf_url,
2602 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2603 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2604 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2605 })
2606 self._sort_formats(formats)
2607
2608 entries.append({
2609 'id': playlist_id,
2610 'title': title,
2611 'description': description,
2612 'thumbnail': thumbnail,
2613 'duration': duration,
2614 'formats': formats,
2615 })
2616 return entries
2617
2618 def _extract_mpd_formats(self, *args, **kwargs):
2619 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2620 if subs:
2621 self._report_ignoring_subs('DASH')
2622 return fmts
2623
2624 def _extract_mpd_formats_and_subtitles(
2625 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2626 fatal=True, data=None, headers={}, query={}):
2627 res = self._download_xml_handle(
2628 mpd_url, video_id,
2629 note='Downloading MPD manifest' if note is None else note,
2630 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2631 fatal=fatal, data=data, headers=headers, query=query)
2632 if res is False:
2633 return [], {}
2634 mpd_doc, urlh = res
2635 if mpd_doc is None:
2636 return [], {}
2637 mpd_base_url = base_url(urlh.geturl())
2638
2639 return self._parse_mpd_formats_and_subtitles(
2640 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2641
2642 def _parse_mpd_formats(self, *args, **kwargs):
2643 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2644 if subs:
2645 self._report_ignoring_subs('DASH')
2646 return fmts
2647
2648 def _parse_mpd_formats_and_subtitles(
2649 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2650 """
2651 Parse formats from MPD manifest.
2652 References:
2653 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2654 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2655 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2656 """
2657 if not self.get_param('dynamic_mpd', True):
2658 if mpd_doc.get('type') == 'dynamic':
2659 return [], {}
2660
2661 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2662
2663 def _add_ns(path):
2664 return self._xpath_ns(path, namespace)
2665
2666 def is_drm_protected(element):
2667 return element.find(_add_ns('ContentProtection')) is not None
2668
2669 def extract_multisegment_info(element, ms_parent_info):
2670 ms_info = ms_parent_info.copy()
2671
2672 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2673 # common attributes and elements. We will only extract relevant
2674 # for us.
2675 def extract_common(source):
2676 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2677 if segment_timeline is not None:
2678 s_e = segment_timeline.findall(_add_ns('S'))
2679 if s_e:
2680 ms_info['total_number'] = 0
2681 ms_info['s'] = []
2682 for s in s_e:
2683 r = int(s.get('r', 0))
2684 ms_info['total_number'] += 1 + r
2685 ms_info['s'].append({
2686 't': int(s.get('t', 0)),
2687 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2688 'd': int(s.attrib['d']),
2689 'r': r,
2690 })
2691 start_number = source.get('startNumber')
2692 if start_number:
2693 ms_info['start_number'] = int(start_number)
2694 timescale = source.get('timescale')
2695 if timescale:
2696 ms_info['timescale'] = int(timescale)
2697 segment_duration = source.get('duration')
2698 if segment_duration:
2699 ms_info['segment_duration'] = float(segment_duration)
2700
2701 def extract_Initialization(source):
2702 initialization = source.find(_add_ns('Initialization'))
2703 if initialization is not None:
2704 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2705
2706 segment_list = element.find(_add_ns('SegmentList'))
2707 if segment_list is not None:
2708 extract_common(segment_list)
2709 extract_Initialization(segment_list)
2710 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2711 if segment_urls_e:
2712 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2713 else:
2714 segment_template = element.find(_add_ns('SegmentTemplate'))
2715 if segment_template is not None:
2716 extract_common(segment_template)
2717 media = segment_template.get('media')
2718 if media:
2719 ms_info['media'] = media
2720 initialization = segment_template.get('initialization')
2721 if initialization:
2722 ms_info['initialization'] = initialization
2723 else:
2724 extract_Initialization(segment_template)
2725 return ms_info
2726
2727 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2728 formats, subtitles = [], {}
2729 stream_numbers = collections.defaultdict(int)
2730 for period in mpd_doc.findall(_add_ns('Period')):
2731 period_duration = parse_duration(period.get('duration')) or mpd_duration
2732 period_ms_info = extract_multisegment_info(period, {
2733 'start_number': 1,
2734 'timescale': 1,
2735 })
2736 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2737 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2738 for representation in adaptation_set.findall(_add_ns('Representation')):
2739 representation_attrib = adaptation_set.attrib.copy()
2740 representation_attrib.update(representation.attrib)
2741 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2742 mime_type = representation_attrib['mimeType']
2743 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2744
2745 codecs = parse_codecs(representation_attrib.get('codecs', ''))
2746 if content_type not in ('video', 'audio', 'text'):
2747 if mime_type == 'image/jpeg':
2748 content_type = mime_type
2749 elif codecs['vcodec'] != 'none':
2750 content_type = 'video'
2751 elif codecs['acodec'] != 'none':
2752 content_type = 'audio'
2753 elif codecs.get('tcodec', 'none') != 'none':
2754 content_type = 'text'
2755 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2756 content_type = 'text'
2757 else:
2758 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2759 continue
2760
2761 base_url = ''
2762 for element in (representation, adaptation_set, period, mpd_doc):
2763 base_url_e = element.find(_add_ns('BaseURL'))
2764 if base_url_e is not None:
2765 base_url = base_url_e.text + base_url
2766 if re.match(r'^https?://', base_url):
2767 break
2768 if mpd_base_url and base_url.startswith('/'):
2769 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2770 elif mpd_base_url and not re.match(r'^https?://', base_url):
2771 if not mpd_base_url.endswith('/'):
2772 mpd_base_url += '/'
2773 base_url = mpd_base_url + base_url
2774 representation_id = representation_attrib.get('id')
2775 lang = representation_attrib.get('lang')
2776 url_el = representation.find(_add_ns('BaseURL'))
2777 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2778 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2779 if representation_id is not None:
2780 format_id = representation_id
2781 else:
2782 format_id = content_type
2783 if mpd_id:
2784 format_id = mpd_id + '-' + format_id
2785 if content_type in ('video', 'audio'):
2786 f = {
2787 'format_id': format_id,
2788 'manifest_url': mpd_url,
2789 'ext': mimetype2ext(mime_type),
2790 'width': int_or_none(representation_attrib.get('width')),
2791 'height': int_or_none(representation_attrib.get('height')),
2792 'tbr': float_or_none(bandwidth, 1000),
2793 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2794 'fps': int_or_none(representation_attrib.get('frameRate')),
2795 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2796 'format_note': 'DASH %s' % content_type,
2797 'filesize': filesize,
2798 'container': mimetype2ext(mime_type) + '_dash',
2799 **codecs
2800 }
2801 elif content_type == 'text':
2802 f = {
2803 'ext': mimetype2ext(mime_type),
2804 'manifest_url': mpd_url,
2805 'filesize': filesize,
2806 }
2807 elif content_type == 'image/jpeg':
2808 # See test case in VikiIE
2809 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2810 f = {
2811 'format_id': format_id,
2812 'ext': 'mhtml',
2813 'manifest_url': mpd_url,
2814 'format_note': 'DASH storyboards (jpeg)',
2815 'acodec': 'none',
2816 'vcodec': 'none',
2817 }
2818 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2819 f['has_drm'] = True
2820 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2821
2822 def prepare_template(template_name, identifiers):
2823 tmpl = representation_ms_info[template_name]
2824 # First of, % characters outside $...$ templates
2825 # must be escaped by doubling for proper processing
2826 # by % operator string formatting used further (see
2827 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2828 t = ''
2829 in_template = False
2830 for c in tmpl:
2831 t += c
2832 if c == '$':
2833 in_template = not in_template
2834 elif c == '%' and not in_template:
2835 t += c
2836 # Next, $...$ templates are translated to their
2837 # %(...) counterparts to be used with % operator
2838 if representation_id is not None:
2839 t = t.replace('$RepresentationID$', representation_id)
2840 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2841 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2842 t.replace('$$', '$')
2843 return t
2844
2845 # @initialization is a regular template like @media one
2846 # so it should be handled just the same way (see
2847 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2848 if 'initialization' in representation_ms_info:
2849 initialization_template = prepare_template(
2850 'initialization',
2851 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2852 # $Time$ shall not be included for @initialization thus
2853 # only $Bandwidth$ remains
2854 ('Bandwidth', ))
2855 representation_ms_info['initialization_url'] = initialization_template % {
2856 'Bandwidth': bandwidth,
2857 }
2858
2859 def location_key(location):
2860 return 'url' if re.match(r'^https?://', location) else 'path'
2861
2862 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2863
2864 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2865 media_location_key = location_key(media_template)
2866
2867 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2868 # can't be used at the same time
2869 if '%(Number' in media_template and 's' not in representation_ms_info:
2870 segment_duration = None
2871 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2872 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2873 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2874 representation_ms_info['fragments'] = [{
2875 media_location_key: media_template % {
2876 'Number': segment_number,
2877 'Bandwidth': bandwidth,
2878 },
2879 'duration': segment_duration,
2880 } for segment_number in range(
2881 representation_ms_info['start_number'],
2882 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2883 else:
2884 # $Number*$ or $Time$ in media template with S list available
2885 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2886 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2887 representation_ms_info['fragments'] = []
2888 segment_time = 0
2889 segment_d = None
2890 segment_number = representation_ms_info['start_number']
2891
2892 def add_segment_url():
2893 segment_url = media_template % {
2894 'Time': segment_time,
2895 'Bandwidth': bandwidth,
2896 'Number': segment_number,
2897 }
2898 representation_ms_info['fragments'].append({
2899 media_location_key: segment_url,
2900 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2901 })
2902
2903 for num, s in enumerate(representation_ms_info['s']):
2904 segment_time = s.get('t') or segment_time
2905 segment_d = s['d']
2906 add_segment_url()
2907 segment_number += 1
2908 for r in range(s.get('r', 0)):
2909 segment_time += segment_d
2910 add_segment_url()
2911 segment_number += 1
2912 segment_time += segment_d
2913 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2914 # No media template
2915 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2916 # or any YouTube dashsegments video
2917 fragments = []
2918 segment_index = 0
2919 timescale = representation_ms_info['timescale']
2920 for s in representation_ms_info['s']:
2921 duration = float_or_none(s['d'], timescale)
2922 for r in range(s.get('r', 0) + 1):
2923 segment_uri = representation_ms_info['segment_urls'][segment_index]
2924 fragments.append({
2925 location_key(segment_uri): segment_uri,
2926 'duration': duration,
2927 })
2928 segment_index += 1
2929 representation_ms_info['fragments'] = fragments
2930 elif 'segment_urls' in representation_ms_info:
2931 # Segment URLs with no SegmentTimeline
2932 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2933 # https://github.com/ytdl-org/youtube-dl/pull/14844
2934 fragments = []
2935 segment_duration = float_or_none(
2936 representation_ms_info['segment_duration'],
2937 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2938 for segment_url in representation_ms_info['segment_urls']:
2939 fragment = {
2940 location_key(segment_url): segment_url,
2941 }
2942 if segment_duration:
2943 fragment['duration'] = segment_duration
2944 fragments.append(fragment)
2945 representation_ms_info['fragments'] = fragments
2946 # If there is a fragments key available then we correctly recognized fragmented media.
2947 # Otherwise we will assume unfragmented media with direct access. Technically, such
2948 # assumption is not necessarily correct since we may simply have no support for
2949 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2950 if 'fragments' in representation_ms_info:
2951 f.update({
2952 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2953 'url': mpd_url or base_url,
2954 'fragment_base_url': base_url,
2955 'fragments': [],
2956 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2957 })
2958 if 'initialization_url' in representation_ms_info:
2959 initialization_url = representation_ms_info['initialization_url']
2960 if not f.get('url'):
2961 f['url'] = initialization_url
2962 f['fragments'].append({location_key(initialization_url): initialization_url})
2963 f['fragments'].extend(representation_ms_info['fragments'])
2964 else:
2965 # Assuming direct URL to unfragmented media.
2966 f['url'] = base_url
2967 if content_type in ('video', 'audio', 'image/jpeg'):
2968 f['manifest_stream_number'] = stream_numbers[f['url']]
2969 stream_numbers[f['url']] += 1
2970 formats.append(f)
2971 elif content_type == 'text':
2972 subtitles.setdefault(lang or 'und', []).append(f)
2973
2974 return formats, subtitles
2975
2976 def _extract_ism_formats(self, *args, **kwargs):
2977 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2978 if subs:
2979 self._report_ignoring_subs('ISM')
2980 return fmts
2981
2982 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2983 res = self._download_xml_handle(
2984 ism_url, video_id,
2985 note='Downloading ISM manifest' if note is None else note,
2986 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2987 fatal=fatal, data=data, headers=headers, query=query)
2988 if res is False:
2989 return [], {}
2990 ism_doc, urlh = res
2991 if ism_doc is None:
2992 return [], {}
2993
2994 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2995
2996 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2997 """
2998 Parse formats from ISM manifest.
2999 References:
3000 1. [MS-SSTR]: Smooth Streaming Protocol,
3001 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3002 """
3003 if ism_doc.get('IsLive') == 'TRUE':
3004 return [], {}
3005
3006 duration = int(ism_doc.attrib['Duration'])
3007 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3008
3009 formats = []
3010 subtitles = {}
3011 for stream in ism_doc.findall('StreamIndex'):
3012 stream_type = stream.get('Type')
3013 if stream_type not in ('video', 'audio', 'text'):
3014 continue
3015 url_pattern = stream.attrib['Url']
3016 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3017 stream_name = stream.get('Name')
3018 stream_language = stream.get('Language', 'und')
3019 for track in stream.findall('QualityLevel'):
3020 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3021 # TODO: add support for WVC1 and WMAP
3022 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3023 self.report_warning('%s is not a supported codec' % fourcc)
3024 continue
3025 tbr = int(track.attrib['Bitrate']) // 1000
3026 # [1] does not mention Width and Height attributes. However,
3027 # they're often present while MaxWidth and MaxHeight are
3028 # missing, so should be used as fallbacks
3029 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3030 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3031 sampling_rate = int_or_none(track.get('SamplingRate'))
3032
3033 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3034 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3035
3036 fragments = []
3037 fragment_ctx = {
3038 'time': 0,
3039 }
3040 stream_fragments = stream.findall('c')
3041 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3042 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3043 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3044 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3045 if not fragment_ctx['duration']:
3046 try:
3047 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3048 except IndexError:
3049 next_fragment_time = duration
3050 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3051 for _ in range(fragment_repeat):
3052 fragments.append({
3053 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3054 'duration': fragment_ctx['duration'] / stream_timescale,
3055 })
3056 fragment_ctx['time'] += fragment_ctx['duration']
3057
3058 if stream_type == 'text':
3059 subtitles.setdefault(stream_language, []).append({
3060 'ext': 'ismt',
3061 'protocol': 'ism',
3062 'url': ism_url,
3063 'manifest_url': ism_url,
3064 'fragments': fragments,
3065 '_download_params': {
3066 'stream_type': stream_type,
3067 'duration': duration,
3068 'timescale': stream_timescale,
3069 'fourcc': fourcc,
3070 'language': stream_language,
3071 'codec_private_data': track.get('CodecPrivateData'),
3072 }
3073 })
3074 elif stream_type in ('video', 'audio'):
3075 formats.append({
3076 'format_id': join_nonempty(ism_id, stream_name, tbr),
3077 'url': ism_url,
3078 'manifest_url': ism_url,
3079 'ext': 'ismv' if stream_type == 'video' else 'isma',
3080 'width': width,
3081 'height': height,
3082 'tbr': tbr,
3083 'asr': sampling_rate,
3084 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3085 'acodec': 'none' if stream_type == 'video' else fourcc,
3086 'protocol': 'ism',
3087 'fragments': fragments,
3088 'has_drm': ism_doc.find('Protection') is not None,
3089 '_download_params': {
3090 'stream_type': stream_type,
3091 'duration': duration,
3092 'timescale': stream_timescale,
3093 'width': width or 0,
3094 'height': height or 0,
3095 'fourcc': fourcc,
3096 'language': stream_language,
3097 'codec_private_data': track.get('CodecPrivateData'),
3098 'sampling_rate': sampling_rate,
3099 'channels': int_or_none(track.get('Channels', 2)),
3100 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3101 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3102 },
3103 })
3104 return formats, subtitles
3105
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> tags in a webpage.

        Also matches amp-video/amp-audio and dl8-video/dl8-audio variants.
        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'.
        """
        def absolute_url(item_url):
            # resolve against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # map a "type" attribute (mimetype plus optional codecs) to ext/codec fields
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # NOTE(review): mutable default is never mutated here, but fragile
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # self-closing tags first (no inner content)
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # scan nested <source> tags for additional renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # fall back to parsing e.g. "720p" out of the label
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # values from the format dict win over attribute-derived ones
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags carry subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3228
3229 def _extract_akamai_formats(self, *args, **kwargs):
3230 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3231 if subs:
3232 self._report_ignoring_subs('akamai')
3233 return fmts
3234
3235 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3236 signed = 'hdnea=' in manifest_url
3237 if not signed:
3238 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3239 manifest_url = re.sub(
3240 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3241 '', manifest_url).strip('?')
3242
3243 formats = []
3244 subtitles = {}
3245
3246 hdcore_sign = 'hdcore=3.7.0'
3247 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3248 hds_host = hosts.get('hds')
3249 if hds_host:
3250 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3251 if 'hdcore=' not in f4m_url:
3252 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3253 f4m_formats = self._extract_f4m_formats(
3254 f4m_url, video_id, f4m_id='hds', fatal=False)
3255 for entry in f4m_formats:
3256 entry.update({'extra_param_to_segment_url': hdcore_sign})
3257 formats.extend(f4m_formats)
3258
3259 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3260 hls_host = hosts.get('hls')
3261 if hls_host:
3262 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3263 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3264 m3u8_url, video_id, 'mp4', 'm3u8_native',
3265 m3u8_id='hls', fatal=False)
3266 formats.extend(m3u8_formats)
3267 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3268
3269 http_host = hosts.get('http')
3270 if http_host and m3u8_formats and not signed:
3271 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3272 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3273 qualities_length = len(qualities)
3274 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3275 i = 0
3276 for f in m3u8_formats:
3277 if f['vcodec'] != 'none':
3278 for protocol in ('http', 'https'):
3279 http_f = f.copy()
3280 del http_f['manifest_url']
3281 http_url = re.sub(
3282 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3283 http_f.update({
3284 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3285 'url': http_url,
3286 'protocol': protocol,
3287 })
3288 formats.append(http_f)
3289 i += 1
3290
3291 return formats, subtitles
3292
3293 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3294 query = compat_urlparse.urlparse(url).query
3295 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3296 mobj = re.search(
3297 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3298 url_base = mobj.group('url')
3299 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3300 formats = []
3301
3302 def manifest_url(manifest):
3303 m_url = '%s/%s' % (http_base_url, manifest)
3304 if query:
3305 m_url += '?%s' % query
3306 return m_url
3307
3308 if 'm3u8' not in skip_protocols:
3309 formats.extend(self._extract_m3u8_formats(
3310 manifest_url('playlist.m3u8'), video_id, 'mp4',
3311 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3312 if 'f4m' not in skip_protocols:
3313 formats.extend(self._extract_f4m_formats(
3314 manifest_url('manifest.f4m'),
3315 video_id, f4m_id='hds', fatal=False))
3316 if 'dash' not in skip_protocols:
3317 formats.extend(self._extract_mpd_formats(
3318 manifest_url('manifest.mpd'),
3319 video_id, mpd_id='dash', fatal=False))
3320 if re.search(r'(?:/smil:|\.smil)', url_base):
3321 if 'smil' not in skip_protocols:
3322 rtmp_formats = self._extract_smil_formats(
3323 manifest_url('jwplayer.smil'),
3324 video_id, fatal=False)
3325 for rtmp_format in rtmp_formats:
3326 rtsp_format = rtmp_format.copy()
3327 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3328 del rtsp_format['play_path']
3329 del rtsp_format['ext']
3330 rtsp_format.update({
3331 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3332 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3333 'protocol': 'rtsp',
3334 })
3335 formats.extend([rtmp_format, rtsp_format])
3336 else:
3337 for protocol in ('rtmp', 'rtsp'):
3338 if protocol not in skip_protocols:
3339 formats.append({
3340 'url': '%s:%s' % (protocol, url_base),
3341 'format_id': protocol,
3342 'protocol': protocol,
3343 })
3344 return formats
3345
3346 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3347 mobj = re.search(
3348 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3349 webpage)
3350 if mobj:
3351 try:
3352 jwplayer_data = self._parse_json(mobj.group('options'),
3353 video_id=video_id,
3354 transform_source=transform_source)
3355 except ExtractorError:
3356 pass
3357 else:
3358 if isinstance(jwplayer_data, dict):
3359 return jwplayer_data
3360
3361 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3362 jwplayer_data = self._find_jwplayer_data(
3363 webpage, video_id, transform_source=js_to_json)
3364 return self._parse_jwplayer_data(
3365 jwplayer_data, video_id, *args, **kwargs)
3366
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup/config dict into yt-dlp info dict(s).

        @param jwplayer_data  Parsed JWPlayer options (legacy flattened forms
                              are normalized below)
        @param require_title  When true, a playlist item without a title raises
                              KeyError; otherwise title may be None
        @returns a single info dict when the playlist has exactly one entry,
                 otherwise a playlist result of all entries
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks; other track kinds are ignored
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # fall back to 'en' when the track carries no label
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated to the YouTube extractor
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3434
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into a list of format dicts.

        Each source is dispatched on its MIME type/extension: HLS, DASH and
        SMIL manifests are expanded via the corresponding helpers; audio-only
        and progressive sources become single formats; rtmp:// URLs are split
        into url + play_path. Duplicate source URLs are skipped.
        """
        urls = []  # source URLs seen so far, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3498
3499 def _live_title(self, name):
3500 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3501 return name
3502
3503 def _int(self, v, name, fatal=False, **kwargs):
3504 res = int_or_none(v, **kwargs)
3505 if 'get_attr' in kwargs:
3506 print(getattr(v, kwargs['get_attr']))
3507 if res is None:
3508 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3509 if fatal:
3510 raise ExtractorError(msg)
3511 else:
3512 self.report_warning(msg)
3513 return res
3514
3515 def _float(self, v, name, fatal=False, **kwargs):
3516 res = float_or_none(v, **kwargs)
3517 if res is None:
3518 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3519 if fatal:
3520 raise ExtractorError(msg)
3521 else:
3522 self.report_warning(msg)
3523 return res
3524
3525 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3526 path='/', secure=False, discard=False, rest={}, **kwargs):
3527 cookie = compat_cookiejar_Cookie(
3528 0, name, value, port, port is not None, domain, True,
3529 domain.startswith('.'), path, True, secure, expire_time,
3530 discard, None, None, rest)
3531 self._downloader.cookiejar.set_cookie(cookie)
3532
3533 def _get_cookies(self, url):
3534 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3535 req = sanitized_Request(url)
3536 self._downloader.cookiejar.add_cookie_header(req)
3537 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3538
3539 def _apply_first_set_cookie_header(self, url_handle, cookie):
3540 """
3541 Apply first Set-Cookie header instead of the last. Experimental.
3542
3543 Some sites (e.g. [1-3]) may serve two cookies under the same name
3544 in Set-Cookie header and expect the first (old) one to be set rather
3545 than second (new). However, as of RFC6265 the newer one cookie
3546 should be set into cookie store what actually happens.
3547 We will workaround this issue by resetting the cookie to
3548 the first one manually.
3549 1. https://new.vk.com/
3550 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3551 3. https://learning.oreilly.com/
3552 """
3553 for header, cookies in url_handle.headers.items():
3554 if header.lower() != 'set-cookie':
3555 continue
3556 if sys.version_info[0] >= 3:
3557 cookies = cookies.encode('iso-8859-1')
3558 cookies = cookies.decode('utf-8')
3559 cookie_value = re.search(
3560 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3561 if cookie_value:
3562 value, domain = cookie_value.groups()
3563 self._set_cookie(domain, cookie, value)
3564 break
3565
3566 def get_testcases(self, include_onlymatching=False):
3567 t = getattr(self, '_TEST', None)
3568 if t:
3569 assert not hasattr(self, '_TESTS'), \
3570 '%s has _TEST and _TESTS' % type(self).__name__
3571 tests = [t]
3572 else:
3573 tests = getattr(self, '_TESTS', [])
3574 for t in tests:
3575 if not include_onlymatching and t.get('only_matching', False):
3576 continue
3577 t['name'] = type(self).__name__[:-len('IE')]
3578 yield t
3579
3580 def is_suitable(self, age_limit):
3581 """ Test whether the extractor is generally suitable for the given
3582 age limit (i.e. pornographic sites are not, all others usually are) """
3583
3584 any_restricted = False
3585 for tc in self.get_testcases(include_onlymatching=False):
3586 if tc.get('playlist', []):
3587 tc = tc['playlist'][0]
3588 is_restricted = age_restricted(
3589 tc.get('info_dict', {}).get('age_limit'), age_limit)
3590 if not is_restricted:
3591 return True
3592 any_restricted = any_restricted or is_restricted
3593 return not any_restricted
3594
3595 def extract_subtitles(self, *args, **kwargs):
3596 if (self.get_param('writesubtitles', False)
3597 or self.get_param('listsubtitles')):
3598 return self._get_subtitles(*args, **kwargs)
3599 return {}
3600
3601 def _get_subtitles(self, *args, **kwargs):
3602 raise NotImplementedError('This method must be implemented by subclasses')
3603
    def extract_comments(self, *args, **kwargs):
        """Return a lazy comment-extraction callable, or None when the user
        did not request comments (getcomments parameter unset).

        The returned function drains the _get_comments generator and returns
        {'comments': [...], 'comment_count': ...}; comment_count is None when
        extraction was interrupted before completion.
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # assume interruption until the generator is fully exhausted
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                self.to_screen('Interrupted by user')
            except Exception as e:
                # other errors abort extraction unless ignoreerrors is True
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # None signals an incomplete count
                'comment_count': None if interrupted else comment_count
            }
        return extractor
3630
3631 def _get_comments(self, *args, **kwargs):
3632 raise NotImplementedError('This method must be implemented by subclasses')
3633
3634 @staticmethod
3635 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3636 """ Merge subtitle items for one language. Items with duplicated URLs
3637 will be dropped. """
3638 list1_urls = set([item['url'] for item in subtitle_list1])
3639 ret = list(subtitle_list1)
3640 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3641 return ret
3642
3643 @classmethod
3644 def _merge_subtitles(cls, *dicts, target=None):
3645 """ Merge subtitle dictionaries, language by language. """
3646 if target is None:
3647 target = {}
3648 for d in dicts:
3649 for lang, subs in d.items():
3650 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3651 return target
3652
3653 def extract_automatic_captions(self, *args, **kwargs):
3654 if (self.get_param('writeautomaticsub', False)
3655 or self.get_param('listsubtitles')):
3656 return self._get_automatic_captions(*args, **kwargs)
3657 return {}
3658
3659 def _get_automatic_captions(self, *args, **kwargs):
3660 raise NotImplementedError('This method must be implemented by subclasses')
3661
3662 def mark_watched(self, *args, **kwargs):
3663 if not self.get_param('mark_watched', False):
3664 return
3665 if (self._get_login_info()[0] is not None
3666 or self.get_param('cookiefile')
3667 or self.get_param('cookiesfrombrowser')):
3668 self._mark_watched(*args, **kwargs)
3669
3670 def _mark_watched(self, *args, **kwargs):
3671 raise NotImplementedError('This method must be implemented by subclasses')
3672
3673 def geo_verification_headers(self):
3674 headers = {}
3675 geo_verification_proxy = self.get_param('geo_verification_proxy')
3676 if geo_verification_proxy:
3677 headers['Ytdl-request-proxy'] = geo_verification_proxy
3678 return headers
3679
3680 def _generic_id(self, url):
3681 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3682
3683 def _generic_title(self, url):
3684 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3685
3686 @staticmethod
3687 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3688 all_known = all(map(
3689 lambda x: x is not None,
3690 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3691 return (
3692 'private' if is_private
3693 else 'premium_only' if needs_premium
3694 else 'subscriber_only' if needs_subscription
3695 else 'needs_auth' if needs_auth
3696 else 'unlisted' if is_unlisted
3697 else 'public' if all_known
3698 else None)
3699
3700 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3701 '''
3702 @returns A list of values for the extractor argument given by "key"
3703 or "default" if no such key is present
3704 @param default The default value to return when the key is not present (default: [])
3705 @param casesense When false, the values are converted to lower case
3706 '''
3707 val = traverse_obj(
3708 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3709 if val is None:
3710 return [] if default is NO_DEFAULT else default
3711 return list(val) if casesense else [x.lower() for x in val]
3712
3713
3714 class SearchInfoExtractor(InfoExtractor):
3715 """
3716 Base class for paged search queries extractors.
3717 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3718 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3719 """
3720
3721 _MAX_RESULTS = float('inf')
3722
3723 @classmethod
3724 def _make_valid_url(cls):
3725 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3726
3727 def _real_extract(self, query):
3728 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3729 if prefix == '':
3730 return self._get_n_results(query, 1)
3731 elif prefix == 'all':
3732 return self._get_n_results(query, self._MAX_RESULTS)
3733 else:
3734 n = int(prefix)
3735 if n <= 0:
3736 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3737 elif n > self._MAX_RESULTS:
3738 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3739 n = self._MAX_RESULTS
3740 return self._get_n_results(query, n)
3741
3742 def _get_n_results(self, query, n):
3743 """Get a specified number of results for a query.
3744 Either this function or _search_results must be overridden by subclasses """
3745 return self.playlist_result(
3746 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3747 query, query)
3748
3749 def _search_results(self, query):
3750 """Returns an iterator of search results"""
3751 raise NotImplementedError('This method must be implemented by subclasses')
3752
3753 @property
3754 def SEARCH_KEY(self):
3755 return self._SEARCH_KEY