# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note
                                 Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx
                                 An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg",
                                 "rtmpe", "m3u8", "m3u8_native" or
                                 "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference
                                 Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference
                                 Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers
                                 A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio
                                 If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options
                                 A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

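    As an illustration, a result carrying a single hypothetical format entry
    might look like this (all values are made up):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video_480p.mp4',
                'format_id': '480p',
                'ext': 'mp4',
                'width': 854,
                'height': 480,
                'vcodec': 'avc1.4d401f',
                'acodec': 'mp4a.40.2',
                'protocol': 'https',
            }],
        }
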
    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends
                    on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
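
    As an illustration of the structures above, "thumbnails" and "subtitles"
    entries might look like this in practice (hypothetical values):

        'thumbnails': [{
            'url': 'https://example.com/thumb_1280x720.jpg',
            'width': 1280,
            'height': 720,
        }],
        'subtitles': {
            'en': [{'url': 'https://example.com/subs.en.vtt', 'ext': 'vtt'}],
        },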


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
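
    For example, a hypothetical playlist result might look like:

        {
            '_type': 'playlist',
            'id': 'channel-123',
            'title': 'Uploads from Example Channel',
            'entries': [...],  # video dicts, or "url"/"url_transparent" dicts
        }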


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
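
    For example (using the well-known youtube-dl test video):

        {
            '_type': 'url',
            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
            'ie_key': 'Youtube',
        }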


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
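
    For example, a page that embeds a third-party player but knows the real
    title could return (hypothetical values):

        {
            '_type': 'url_transparent',
            'url': 'https://videoservice.example/embed/abc123',
            'title': 'Title taken from the embedding page',
        }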


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

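        For example, an extractor that only learns the unrestricted countries
        during extraction might call (hypothetical values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })
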
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it as a dict."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to an IP that belongs to
        # some geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
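
        For example, to treat an HTTP 404 response as valid content rather
        than an error, a hypothetical caller could do:

            webpage = self._download_webpage(
                url, video_id, expected_status=404)

        or accept any 4xx status:

            webpage = self._download_webpage(
                url, video_id, expected_status=lambda x: 400 <= x < 500)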
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, **kwargs):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        On failure, return a default value, report a warning, or raise a
        RegexNotFoundError, depending on fatal and default, naming the
        searched-for field with the given name.
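
        For example (hypothetical pattern and page content):

            video_id = self._search_regex(
                r'data-video-id="([0-9]+)"', webpage, 'video id', default=None)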
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
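
        A matching entry in the ~/.netrc file would look like (placeholder
        credentials):

            machine <netrc_machine> login myusername password mypassword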
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently it just uses the command line option.
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
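    # These helpers match OpenGraph markup such as (illustrative example):
    #   <meta property="og:title" content="Video title" />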
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per the spec), with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of the 'Organization' or 'Person'
                # types. Both types can have a 'name' property (inherited from
                # the 'Thing' type) [1]; however, some websites use the 'Text'
                # type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1475
1476 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1477 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1478 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
1479 ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1480 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1481 'fps', 'fs_approx', 'source', 'format_id')
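# Illustrative sort strings accepted by the regex above (values made up):
#   'res:1080'  - prefer resolutions up to 1080p (':' sets a limit)
#   'br~2000'   - prefer the bitrate closest to 2000 ('~' means closest-to)
#   '+size'     - prefer smaller files ('+' reverses the order)
# e.g. yt-dlp -S 'res:1080,+size' on the command line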
1482
1483 settings = {
1484 'vcodec': {'type': 'ordered', 'regex': True,
1485 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1486 'acodec': {'type': 'ordered', 'regex': True,
1487 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1488 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1489 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1490 'vext': {'type': 'ordered', 'field': 'video_ext',
1491 'order': ('mp4', 'webm', 'flv', '', 'none'),
1492 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1493 'aext': {'type': 'ordered', 'field': 'audio_ext',
1494 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1495 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1496 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1497 'ie_pref': {'priority': True, 'type': 'extractor'},
1498 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1499 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1500 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1501 'quality': {'convert': 'float_none', 'default': -1},
1502 'filesize': {'convert': 'bytes'},
1503 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1504 'id': {'convert': 'string', 'field': 'format_id'},
1505 'height': {'convert': 'float_none'},
1506 'width': {'convert': 'float_none'},
1507 'fps': {'convert': 'float_none'},
1508 'tbr': {'convert': 'float_none'},
1509 'vbr': {'convert': 'float_none'},
1510 'abr': {'convert': 'float_none'},
1511 'asr': {'convert': 'float_none'},
1512 'source': {'convert': 'ignore', 'field': 'source_preference'},
1513
1514 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1515 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1516 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1517 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1518 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1519
1520 # Most of these exist only for compatibility reasons
1521 'dimension': {'type': 'alias', 'field': 'res'},
1522 'resolution': {'type': 'alias', 'field': 'res'},
1523 'extension': {'type': 'alias', 'field': 'ext'},
1524 'bitrate': {'type': 'alias', 'field': 'br'},
1525 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1526 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1527 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1528 'framerate': {'type': 'alias', 'field': 'fps'},
1529 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1530 'protocol': {'type': 'alias', 'field': 'proto'},
1531 'source_preference': {'type': 'alias', 'field': 'source'},
1532 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1533 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1534 'samplerate': {'type': 'alias', 'field': 'asr'},
1535 'video_ext': {'type': 'alias', 'field': 'vext'},
1536 'audio_ext': {'type': 'alias', 'field': 'aext'},
1537 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1538 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1539 'video': {'type': 'alias', 'field': 'hasvid'},
1540 'has_video': {'type': 'alias', 'field': 'hasvid'},
1541 'audio': {'type': 'alias', 'field': 'hasaud'},
1542 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1543 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1544 'preference': {'type': 'alias', 'field': 'ie_pref'},
1545 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1546 'format_id': {'type': 'alias', 'field': 'id'},
1547 }
1548
1549 _order = []
1550
1551 def _get_field_setting(self, field, key):
1552 if field not in self.settings:
1553 self.settings[field] = {}
1554 propObj = self.settings[field]
1555 if key not in propObj:
1556 type = propObj.get('type')
1557 if key == 'field':
1558 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1559 elif key == 'convert':
1560 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1561 else:
1562 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1563 propObj[key] = default
1564 return propObj[key]
1565
1566 def _resolve_field_value(self, field, value, convertNone=False):
1567 if value is None:
1568 if not convertNone:
1569 return None
1570 else:
1571 value = value.lower()
1572 conversion = self._get_field_setting(field, 'convert')
1573 if conversion == 'ignore':
1574 return None
1575 if conversion == 'string':
1576 return value
1577 elif conversion == 'float_none':
1578 return float_or_none(value)
1579 elif conversion == 'bytes':
1580 return FileDownloader.parse_bytes(value)
1581 elif conversion == 'order':
1582 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1583 use_regex = self._get_field_setting(field, 'regex')
1584 list_length = len(order_list)
1585 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1586 if use_regex and value is not None:
1587 for i, regex in enumerate(order_list):
1588 if regex and re.match(regex, value):
1589 return list_length - i
1590 return list_length - empty_pos # not in list
1591 else: # not regex or value = None
1592 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1593 else:
1594 if value.isnumeric():
1595 return float(value)
1596 else:
1597 self.settings[field]['convert'] = 'string'
1598 return value
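# Illustrative ranking (not part of the module): with the 'vcodec' order
# list above, the 'order' conversion returns larger numbers for better codecs:
#   _resolve_field_value('vcodec', 'vp9')  -> matched by 'vp0?9'
#   _resolve_field_value('vcodec', 'h264') -> matched by '[hx]264|avc', ranked lower
#   _resolve_field_value('vcodec', 'xyz')  -> no match, ranked at the '' position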
1599
1600 def evaluate_params(self, params, sort_extractor):
1601 self._use_free_order = params.get('prefer_free_formats', False)
1602 self._sort_user = params.get('format_sort', [])
1603 self._sort_extractor = sort_extractor
1604
1605 def add_item(field, reverse, closest, limit_text):
1606 field = field.lower()
1607 if field in self._order:
1608 return
1609 self._order.append(field)
1610 limit = self._resolve_field_value(field, limit_text)
1611 data = {
1612 'reverse': reverse,
1613 'closest': False if limit is None else closest,
1614 'limit_text': limit_text,
1615 'limit': limit}
1616 if field in self.settings:
1617 self.settings[field].update(data)
1618 else:
1619 self.settings[field] = data
1620
1621 sort_list = (
1622 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1623 + (tuple() if params.get('format_sort_force', False)
1624 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1625 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1626
1627 for item in sort_list:
1628 match = re.match(self.regex, item)
1629 if match is None:
1630 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1631 field = match.group('field')
1632 if field is None:
1633 continue
1634 if self._get_field_setting(field, 'type') == 'alias':
1635 field = self._get_field_setting(field, 'field')
1636 reverse = match.group('reverse') is not None
1637 closest = match.group('separator') == '~'
1638 limit_text = match.group('limit')
1639
1640 has_limit = limit_text is not None
1641 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1642 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1643
1644 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1645 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1646 limit_count = len(limits)
1647 for (i, f) in enumerate(fields):
1648 add_item(f, reverse, closest,
1649 limits[i] if i < limit_count
1650 else limits[0] if has_limit and not has_multiple_limits
1651 else None)
1652
1653 def print_verbose_info(self, write_debug):
1654 if self._sort_user:
1655 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1656 if self._sort_extractor:
1657 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1658 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1659 '+' if self._get_field_setting(field, 'reverse') else '', field,
1660 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1661 self._get_field_setting(field, 'limit_text'),
1662 self._get_field_setting(field, 'limit'))
1663 if self._get_field_setting(field, 'limit_text') is not None else '')
1664 for field in self._order if self._get_field_setting(field, 'visible')]))
1665
1666 def _calculate_field_preference_from_value(self, format, field, type, value):
1667 reverse = self._get_field_setting(field, 'reverse')
1668 closest = self._get_field_setting(field, 'closest')
1669 limit = self._get_field_setting(field, 'limit')
1670
1671 if type == 'extractor':
1672 maximum = self._get_field_setting(field, 'max')
1673 if value is None or (maximum is not None and value >= maximum):
1674 value = -1
1675 elif type == 'boolean':
1676 in_list = self._get_field_setting(field, 'in_list')
1677 not_in_list = self._get_field_setting(field, 'not_in_list')
1678 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1679 elif type == 'ordered':
1680 value = self._resolve_field_value(field, value, True)
1681
1682 # try to convert to number
1683 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1684 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1685 if is_num:
1686 value = val_num
1687
1688 return ((-10, 0) if value is None
1689 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1690 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1691 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1692 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1693 else (-1, value, 0))
1694
1695 def _calculate_field_preference(self, format, field):
1696 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1697 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1698 if type == 'multiple':
1699 type = 'field' # Only 'field' is allowed in multiple for now
1700 actual_fields = self._get_field_setting(field, 'field')
1701
1702 def wrapped_function(values):
1703 values = tuple(filter(lambda x: x is not None, values))
1704 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1705 else values[0] if values
1706 else None)
1707
1708 value = wrapped_function((get_value(f) for f in actual_fields))
1709 else:
1710 value = get_value(field)
1711 return self._calculate_field_preference_from_value(format, field, type, value)
1712
1713 def calculate_preference(self, format):
1714 # Determine missing protocol
1715 if not format.get('protocol'):
1716 format['protocol'] = determine_protocol(format)
1717
1718 # Determine missing ext
1719 if not format.get('ext') and 'url' in format:
1720 format['ext'] = determine_ext(format['url'])
1721 if format.get('vcodec') == 'none':
1722 format['audio_ext'] = format['ext']
1723 format['video_ext'] = 'none'
1724 else:
1725 format['video_ext'] = format['ext']
1726 format['audio_ext'] = 'none'
1727 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1728 # format['preference'] = -1000
1729
1730 # Determine missing bitrates
1731 if format.get('tbr') is None:
1732 if format.get('vbr') is not None and format.get('abr') is not None:
1733 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1734 else:
1735 if format.get('vcodec') != "none" and format.get('vbr') is None:
1736 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1737 if format.get('acodec') != "none" and format.get('abr') is None:
1738 format['abr'] = format.get('tbr') - format.get('vbr', 0)
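# e.g. vbr=1000 with abr=128 yields tbr=1128 above; conversely, a format
# with tbr=1128 and abr=128 gets vbr=1000 derived here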
1739
1740 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1741
1742 def _sort_formats(self, formats, field_preference=[]):
1743 if not formats:
1744 if self.get_param('ignore_no_formats_error'):
1745 return
1746 raise ExtractorError('No video formats found')
1747 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1748 format_sort.evaluate_params(self._downloader.params, field_preference)
1749 if self.get_param('verbose', False):
1750 format_sort.print_verbose_info(self._downloader.write_debug)
1751 formats.sort(key=lambda f: format_sort.calculate_preference(f))
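# Illustrative usage from a hypothetical extractor (names made up):
#   formats = [{'url': src['file'], 'height': src.get('height')}
#              for src in sources]
#   self._sort_formats(formats)
# Sorting is ascending, so the best format ends up last, matching the
# "worst to best" ordering expected for the formats list.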
1752
1753 def _check_formats(self, formats, video_id):
1754 if formats:
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1757 f['url'], video_id,
1758 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1759 formats)
1760
1761 @staticmethod
1762 def _remove_duplicate_formats(formats):
1763 format_urls = set()
1764 unique_formats = []
1765 for f in formats:
1766 if f['url'] not in format_urls:
1767 format_urls.add(f['url'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
1770
1771 def _is_valid_url(self, url, video_id, item='video', headers={}):
1772 url = self._proto_relative_url(url, scheme='http:')
1773 # For now, assume non-HTTP(S) URLs are always valid
1774 if not (url.startswith('http://') or url.startswith('https://')):
1775 return True
1776 try:
1777 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1778 return True
1779 except ExtractorError as e:
1780 self.to_screen(
1781 '%s: %s URL is invalid, skipping: %s'
1782 % (video_id, item, error_to_compat_str(e.cause)))
1783 return False
1784
1785 def http_scheme(self):
1786 """ Either "http:" or "https:", depending on the user's preferences """
1787 return (
1788 'http:'
1789 if self.get_param('prefer_insecure', False)
1790 else 'https:')
1791
1792 def _proto_relative_url(self, url, scheme=None):
1793 if url is None:
1794 return url
1795 if url.startswith('//'):
1796 if scheme is None:
1797 scheme = self.http_scheme()
1798 return scheme + url
1799 else:
1800 return url
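# Illustrative behaviour (made-up URLs), assuming default settings:
#   _proto_relative_url('//cdn.example.com/v.mp4')           -> 'https://cdn.example.com/v.mp4'
#   _proto_relative_url('//cdn.example.com/v.mp4', 'http:')  -> 'http://cdn.example.com/v.mp4'
#   _proto_relative_url('https://a/b.mp4')                   -> unchanged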
1801
1802 def _sleep(self, timeout, video_id, msg_template=None):
1803 if msg_template is None:
1804 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1805 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1806 self.to_screen(msg)
1807 time.sleep(timeout)
1808
1809 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1810 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1811 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1812 manifest = self._download_xml(
1813 manifest_url, video_id, 'Downloading f4m manifest',
1814 'Unable to download f4m manifest',
1815 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1816 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1817 transform_source=transform_source,
1818 fatal=fatal, data=data, headers=headers, query=query)
1819
1820 if manifest is False:
1821 return []
1822
1823 return self._parse_f4m_formats(
1824 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1825 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1826
1827 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1828 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1829 fatal=True, m3u8_id=None):
1830 if not isinstance(manifest, compat_etree_Element) and not fatal:
1831 return []
1832
1833 # Currently, yt-dlp cannot decode the playerVerificationChallenge, as Akamai uses Adobe Alchemy
1834 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1835 if akamai_pv is not None and ';' in akamai_pv.text:
1836 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1837 if playerVerificationChallenge.strip() != '':
1838 return []
1839
1840 formats = []
1841 manifest_version = '1.0'
1842 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1843 if not media_nodes:
1844 manifest_version = '2.0'
1845 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1846 # Remove unsupported DRM protected media from final formats
1847 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1848 media_nodes = remove_encrypted_media(media_nodes)
1849 if not media_nodes:
1850 return formats
1851
1852 manifest_base_url = get_base_url(manifest)
1853
1854 bootstrap_info = xpath_element(
1855 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1856 'bootstrap info', default=None)
1857
1858 vcodec = None
1859 mime_type = xpath_text(
1860 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1861 'mime type', default=None)
1862 if mime_type and mime_type.startswith('audio/'):
1863 vcodec = 'none'
1864
1865 for i, media_el in enumerate(media_nodes):
1866 tbr = int_or_none(media_el.attrib.get('bitrate'))
1867 width = int_or_none(media_el.attrib.get('width'))
1868 height = int_or_none(media_el.attrib.get('height'))
1869 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1870 # If <bootstrapInfo> is present, the specified f4m is a
1871 # stream-level manifest, and only set-level manifests may refer to
1872 # external resources. See section 11.4 and section 4 of F4M spec
1873 if bootstrap_info is None:
1874 media_url = None
1875 # @href is introduced in 2.0, see section 11.6 of F4M spec
1876 if manifest_version == '2.0':
1877 media_url = media_el.attrib.get('href')
1878 if media_url is None:
1879 media_url = media_el.attrib.get('url')
1880 if not media_url:
1881 continue
1882 manifest_url = (
1883 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1884 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1885 # If media_url is itself an f4m manifest, do the recursive extraction,
1886 # since bitrates in the parent manifest (this one) and the media_url
1887 # manifest may differ, making it impossible to resolve the format by the
1888 # requested bitrate in the f4m downloader
1889 ext = determine_ext(manifest_url)
1890 if ext == 'f4m':
1891 f4m_formats = self._extract_f4m_formats(
1892 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1893 transform_source=transform_source, fatal=fatal)
1894 # Sometimes a stream-level manifest contains a single media entry that
1895 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player).
1896 # At the same time, the parent's media entry in the set-level manifest
1897 # may contain it, so we copy it from the parent in such cases.
1898 if len(f4m_formats) == 1:
1899 f = f4m_formats[0]
1900 f.update({
1901 'tbr': f.get('tbr') or tbr,
1902 'width': f.get('width') or width,
1903 'height': f.get('height') or height,
1904 'format_id': f.get('format_id') if not tbr else format_id,
1905 'vcodec': vcodec,
1906 })
1907 formats.extend(f4m_formats)
1908 continue
1909 elif ext == 'm3u8':
1910 formats.extend(self._extract_m3u8_formats(
1911 manifest_url, video_id, 'mp4', preference=preference,
1912 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1913 continue
1914 formats.append({
1915 'format_id': format_id,
1916 'url': manifest_url,
1917 'manifest_url': manifest_url,
1918 'ext': 'flv' if bootstrap_info is not None else None,
1919 'protocol': 'f4m',
1920 'tbr': tbr,
1921 'width': width,
1922 'height': height,
1923 'vcodec': vcodec,
1924 'preference': preference,
1925 'quality': quality,
1926 })
1927 return formats
1928
1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1930 return {
1931 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1932 'url': m3u8_url,
1933 'ext': ext,
1934 'protocol': 'm3u8',
1935 'preference': preference - 100 if preference else -100,
1936 'quality': quality,
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
1939 }
1940
1941 def _extract_m3u8_formats(self, *args, **kwargs):
1942 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1943 if subs:
1944 self.report_warning(bug_reports_message(
1945 "Ignoring subtitle tracks found in the HLS manifest; "
1946 "if any subtitle tracks are missing,"
1947 ))
1948 return fmts
1949
1950 def _extract_m3u8_formats_and_subtitles(
1951 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1952 preference=None, quality=None, m3u8_id=None, note=None,
1953 errnote=None, fatal=True, live=False, data=None, headers={},
1954 query={}):
1955
1956 res = self._download_webpage_handle(
1957 m3u8_url, video_id,
1958 note='Downloading m3u8 information' if note is None else note,
1959 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1960 fatal=fatal, data=data, headers=headers, query=query)
1961
1962 if res is False:
1963 return [], {}
1964
1965 m3u8_doc, urlh = res
1966 m3u8_url = urlh.geturl()
1967
1968 return self._parse_m3u8_formats_and_subtitles(
1969 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1970 preference=preference, quality=quality, m3u8_id=m3u8_id,
1971 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1972 headers=headers, query=query, video_id=video_id)
1973
1974 def _parse_m3u8_formats_and_subtitles(
1975 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1976 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1977 errnote=None, fatal=True, data=None, headers={}, query={},
1978 video_id=None):
1979
1980 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1981 return [], {}
1982
1983 if (not self.get_param('allow_unplayable_formats')
1984 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1985 return [], {}
1986
1987 formats = []
1988
1989 subtitles = {}
1990
1991 format_url = lambda u: (
1992 u
1993 if re.match(r'^https?://', u)
1994 else compat_urlparse.urljoin(m3u8_url, u))
1995
1996 split_discontinuity = self.get_param('hls_split_discontinuity', False)
1997
1998 # References:
1999 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2000 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2001 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2002
2003 # We should try extracting formats only from master playlists [1, 4.3.4],
2004 # i.e. playlists that describe the available qualities. On the other hand,
2005 # media playlists [1, 4.3.3] should be returned as is since they contain
2006 # just the media without quality renditions.
2007 # Fortunately, a master playlist can easily be distinguished from a media
2008 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2009 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2010 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2011 # media playlist and MUST NOT appear in a master playlist, thus we can
2012 # reliably detect a media playlist with this criterion.
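# Illustrative sketch (made-up playlists): a master playlist looks like
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# while a media playlist carries the segments themselves:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   seg0.ts
# hence the #EXT-X-TARGETDURATION check below.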
2013
2014 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
2015 fatal=True, data=None, headers={}):
2016 if not m3u8_doc:
2017 if not format_url:
2018 return []
2019 res = self._download_webpage_handle(
2020 format_url, video_id,
2021 note=False,
2022 errnote='Failed to download m3u8 playlist information',
2023 fatal=fatal, data=data, headers=headers)
2024
2025 if res is False:
2026 return []
2027
2028 m3u8_doc, urlh = res
2029 format_url = urlh.geturl()
2030
2031 playlist_formats = []
2032 i = (
2033 0
2034 if split_discontinuity
2035 else None)
2036 format_info = {
2037 'index': i,
2038 'key_data': None,
2039 'files': [],
2040 }
2041 for line in m3u8_doc.splitlines():
2042 if not line.startswith('#'):
2043 format_info['files'].append(line)
2044 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
2045 i += 1
2046 playlist_formats.append(format_info)
2047 format_info = {
2048 'index': i,
2049 'url': format_url,
2050 'files': [],
2051 }
2052 playlist_formats.append(format_info)
2053 return playlist_formats
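# Illustrative sketch (made-up playlist): with hls_split_discontinuity
# enabled, a media playlist containing
#   seg0.ts
#   seg1.ts
#   #EXT-X-DISCONTINUITY
#   seg2.ts
# yields {'index': 0, 'files': ['seg0.ts', 'seg1.ts'], ...} and
# {'index': 1, 'files': ['seg2.ts'], ...}, each becoming its own format.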
2054
2055 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2056
2057 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
2058
2059 for format in playlist_formats:
2060 format_id = []
2061 if m3u8_id:
2062 format_id.append(m3u8_id)
2063 format_index = format.get('index')
2064 if format_index:
2065 format_id.append(str(format_index))
2066 f = {
2067 'format_id': '-'.join(format_id),
2068 'format_index': format_index,
2069 'url': m3u8_url,
2070 'ext': ext,
2071 'protocol': entry_protocol,
2072 'preference': preference,
2073 'quality': quality,
2074 }
2075 formats.append(f)
2076
2077 return formats, subtitles
2078
2079 groups = {}
2080 last_stream_inf = {}
2081
2082 def extract_media(x_media_line):
2083 media = parse_m3u8_attributes(x_media_line)
2084 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
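# e.g. #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",
#      LANGUAGE="en",URI="audio-en.m3u8"  (illustrative line, made up)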
2085 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2086 if not (media_type and group_id and name):
2087 return
2088 groups.setdefault(group_id, []).append(media)
2089 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2090 if media_type == 'SUBTITLES':
2091 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2092 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2093 # However, lack of URI has been spotted in the wild.
2094 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2095 if not media.get('URI'):
2096 return
2097 url = format_url(media['URI'])
2098 sub_info = {
2099 'url': url,
2100 'ext': determine_ext(url),
2101 }
2102 if sub_info['ext'] == 'm3u8':
2103 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2104 # files may contain is WebVTT:
2105 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2106 sub_info['ext'] = 'vtt'
2107 sub_info['protocol'] = 'm3u8_native'
2108 lang = media.get('LANGUAGE') or 'und'
2109 subtitles.setdefault(lang, []).append(sub_info)
2110 if media_type not in ('VIDEO', 'AUDIO'):
2111 return
2112 media_url = media.get('URI')
2113 if media_url:
2114 manifest_url = format_url(media_url)
2115 format_id = []
2116 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2117 fatal=fatal, data=data, headers=headers)
2118
2119 for format in playlist_formats:
2120 format_index = format.get('index')
2121 for v in (m3u8_id, group_id, name):
2122 if v:
2123 format_id.append(v)
2124 if format_index:
2125 format_id.append(str(format_index))
2126 f = {
2127 'format_id': '-'.join(format_id),
2128 'format_index': format_index,
2129 'url': manifest_url,
2130 'manifest_url': m3u8_url,
2131 'language': media.get('LANGUAGE'),
2132 'ext': ext,
2133 'protocol': entry_protocol,
2134 'preference': preference,
2135 'quality': quality,
2136 }
2137 if media_type == 'AUDIO':
2138 f['vcodec'] = 'none'
2139 formats.append(f)
2140
2141 def build_stream_name():
2142 # Although the specification does not mention the NAME attribute for
2143 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2144 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2145 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2146 stream_name = last_stream_inf.get('NAME')
2147 if stream_name:
2148 return stream_name
2149 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2150 # from corresponding rendition group
2151 stream_group_id = last_stream_inf.get('VIDEO')
2152 if not stream_group_id:
2153 return
2154 stream_group = groups.get(stream_group_id)
2155 if not stream_group:
2156 return stream_group_id
2157 rendition = stream_group[0]
2158 return rendition.get('NAME') or stream_group_id
2159
2160 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2161 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2162 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2163 for line in m3u8_doc.splitlines():
2164 if line.startswith('#EXT-X-MEDIA:'):
2165 extract_media(line)
2166
2167 for line in m3u8_doc.splitlines():
2168 if line.startswith('#EXT-X-STREAM-INF:'):
2169 last_stream_inf = parse_m3u8_attributes(line)
2170 elif line.startswith('#') or not line.strip():
2171 continue
2172 else:
2173 tbr = float_or_none(
2174 last_stream_inf.get('AVERAGE-BANDWIDTH')
2175 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2176 manifest_url = format_url(line.strip())
2177
2178 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2179 fatal=fatal, data=data, headers=headers)
2180
2181 for frmt in playlist_formats:
2182 format_id = []
2183 if m3u8_id:
2184 format_id.append(m3u8_id)
2185 format_index = frmt.get('index')
2186 stream_name = build_stream_name()
2187 # The bandwidth of live streams may differ over time, making
2188 # format_id unpredictable, so it's better to keep the provided
2189 # format_id intact.
2190 if not live:
2191 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2192 if format_index:
2193 format_id.append(str(format_index))
2194 f = {
2195 'format_id': '-'.join(format_id),
2196 'format_index': format_index,
2197 'url': manifest_url,
2198 'manifest_url': m3u8_url,
2199 'tbr': tbr,
2200 'ext': ext,
2201 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2202 'protocol': entry_protocol,
2203 'preference': preference,
2204 'quality': quality,
2205 }
2206 resolution = last_stream_inf.get('RESOLUTION')
2207 if resolution:
2208 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2209 if mobj:
2210 f['width'] = int(mobj.group('width'))
2211 f['height'] = int(mobj.group('height'))
2212 # Unified Streaming Platform
2213 mobj = re.search(
2214 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
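# e.g. '.../video.ism/manifest(format=m3u8-aapl,audio=128000-video=800000).m3u8'
# (made-up URL) would yield abr=128 and vbr=800 below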
2215 if mobj:
2216 abr, vbr = mobj.groups()
2217 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2218 f.update({
2219 'vbr': vbr,
2220 'abr': abr,
2221 })
2222 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2223 f.update(codecs)
2224 audio_group_id = last_stream_inf.get('AUDIO')
2225 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2226 # references a rendition group MUST have a CODECS attribute.
2227 # However, this is not always respected, for example, [2]
2228 # contains EXT-X-STREAM-INF tag which references AUDIO
2229 # rendition group but does not have CODECS and despite
2230 # referencing an audio group it represents a complete
2231 # (with audio and video) format. So, for such cases we will
2232 # ignore references to rendition groups and treat them
2233 # as complete formats.
2234 if audio_group_id and codecs and f.get('vcodec') != 'none':
2235 audio_group = groups.get(audio_group_id)
2236 if audio_group and audio_group[0].get('URI'):
2237 # TODO: update acodec for audio only formats with
2238 # the same GROUP-ID
2239 f['acodec'] = 'none'
2240 if not f.get('ext'):
2241 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2242 formats.append(f)
2243
2244 # for DailyMotion
2245 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2246 if progressive_uri:
2247 http_f = f.copy()
2248 del http_f['manifest_url']
2249 http_f.update({
2250 'format_id': f['format_id'].replace('hls-', 'http-'),
2251 'protocol': 'http',
2252 'url': progressive_uri,
2253 })
2254 formats.append(http_f)
2255
2256 last_stream_inf = {}
2257 return formats, subtitles
2258
2259 @staticmethod
2260 def _xpath_ns(path, namespace=None):
2261 if not namespace:
2262 return path
2263 out = []
2264 for c in path.split('/'):
2265 if not c or c == '.':
2266 out.append(c)
2267 else:
2268 out.append('{%s}%s' % (namespace, c))
2269 return '/'.join(out)
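# Illustrative behaviour (namespace made up):
#   _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
#   -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'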
2270
2271 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2272 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2273
2274 if smil is False:
2275 assert not fatal
2276 return []
2277
2278 namespace = self._parse_smil_namespace(smil)
2279
2280 return self._parse_smil_formats(
2281 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2282
2283 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2284 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2285 if smil is False:
2286 return {}
2287 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2288
2289 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2290 return self._download_xml(
2291 smil_url, video_id, 'Downloading SMIL file',
2292 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2293
2294 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2295 namespace = self._parse_smil_namespace(smil)
2296
2297 formats = self._parse_smil_formats(
2298 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2299 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2300
2301 video_id = os.path.splitext(url_basename(smil_url))[0]
2302 title = None
2303 description = None
2304 upload_date = None
2305 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2306 name = meta.attrib.get('name')
2307 content = meta.attrib.get('content')
2308 if not name or not content:
2309 continue
2310 if not title and name == 'title':
2311 title = content
2312 elif not description and name in ('description', 'abstract'):
2313 description = content
2314 elif not upload_date and name == 'date':
2315 upload_date = unified_strdate(content)
2316
2317 thumbnails = [{
2318 'id': image.get('type'),
2319 'url': image.get('src'),
2320 'width': int_or_none(image.get('width')),
2321 'height': int_or_none(image.get('height')),
2322 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2323
2324 return {
2325 'id': video_id,
2326 'title': title or video_id,
2327 'description': description,
2328 'upload_date': upload_date,
2329 'thumbnails': thumbnails,
2330 'formats': formats,
2331 'subtitles': subtitles,
2332 }
2333
2334 def _parse_smil_namespace(self, smil):
2335 return self._search_regex(
2336 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2337
2338 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2339 base = smil_url
2340 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2341 b = meta.get('base') or meta.get('httpBase')
2342 if b:
2343 base = b
2344 break
2345
2346 formats = []
2347 rtmp_count = 0
2348 http_count = 0
2349 m3u8_count = 0
2350
2351 srcs = []
2352 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2353 for medium in media:
2354 src = medium.get('src')
2355 if not src or src in srcs:
2356 continue
2357 srcs.append(src)
2358
2359 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2360 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2361 width = int_or_none(medium.get('width'))
2362 height = int_or_none(medium.get('height'))
2363 proto = medium.get('proto')
2364 ext = medium.get('ext')
2365 src_ext = determine_ext(src)
2366 streamer = medium.get('streamer') or base
2367
2368 if proto == 'rtmp' or streamer.startswith('rtmp'):
2369 rtmp_count += 1
2370 formats.append({
2371 'url': streamer,
2372 'play_path': src,
2373 'ext': 'flv',
2374 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2375 'tbr': bitrate,
2376 'filesize': filesize,
2377 'width': width,
2378 'height': height,
2379 })
2380 if transform_rtmp_url:
2381 streamer, src = transform_rtmp_url(streamer, src)
2382 formats[-1].update({
2383 'url': streamer,
2384 'play_path': src,
2385 })
2386 continue
2387
2388 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2389 src_url = src_url.strip()
2390
2391 if proto == 'm3u8' or src_ext == 'm3u8':
2392 m3u8_formats = self._extract_m3u8_formats(
2393 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2394 if len(m3u8_formats) == 1:
2395 m3u8_count += 1
2396 m3u8_formats[0].update({
2397 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2398 'tbr': bitrate,
2399 'width': width,
2400 'height': height,
2401 })
2402 formats.extend(m3u8_formats)
2403 elif src_ext == 'f4m':
2404 f4m_url = src_url
2405 if not f4m_params:
2406 f4m_params = {
2407 'hdcore': '3.2.0',
2408 'plugin': 'flowplayer-3.2.0.1',
2409 }
2410 f4m_url += '&' if '?' in f4m_url else '?'
2411 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2412 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2413 elif src_ext == 'mpd':
2414 formats.extend(self._extract_mpd_formats(
2415 src_url, video_id, mpd_id='dash', fatal=False))
2416 elif re.search(r'\.ism/[Mm]anifest', src_url):
2417 formats.extend(self._extract_ism_formats(
2418 src_url, video_id, ism_id='mss', fatal=False))
2419 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2420 http_count += 1
2421 formats.append({
2422 'url': src_url,
2423 'ext': ext or src_ext or 'flv',
2424 'format_id': 'http-%d' % (bitrate or http_count),
2425 'tbr': bitrate,
2426 'filesize': filesize,
2427 'width': width,
2428 'height': height,
2429 })
2430
2431 return formats
2432
2433 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2434 urls = []
2435 subtitles = {}
2436 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2437 src = textstream.get('src')
2438 if not src or src in urls:
2439 continue
2440 urls.append(src)
2441 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2442 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2443 subtitles.setdefault(lang, []).append({
2444 'url': src,
2445 'ext': ext,
2446 })
2447 return subtitles
2448
2449 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2450 xspf = self._download_xml(
2451 xspf_url, playlist_id, 'Downloading xspf playlist',
2452 'Unable to download xspf playlist', fatal=fatal)
2453 if xspf is False:
2454 return []
2455 return self._parse_xspf(
2456 xspf, playlist_id, xspf_url=xspf_url,
2457 xspf_base_url=base_url(xspf_url))
2458
2459 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2460 NS_MAP = {
2461 'xspf': 'http://xspf.org/ns/0/',
2462 's1': 'http://static.streamone.nl/player/ns/0',
2463 }
2464
2465 entries = []
2466 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2467 title = xpath_text(
2468 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2469 description = xpath_text(
2470 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2471 thumbnail = xpath_text(
2472 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2473 duration = float_or_none(
2474 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2475
2476 formats = []
2477 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2478 format_url = urljoin(xspf_base_url, location.text)
2479 if not format_url:
2480 continue
2481 formats.append({
2482 'url': format_url,
2483 'manifest_url': xspf_url,
2484 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2485 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2486 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2487 })
2488 self._sort_formats(formats)
2489
2490 entries.append({
2491 'id': playlist_id,
2492 'title': title,
2493 'description': description,
2494 'thumbnail': thumbnail,
2495 'duration': duration,
2496 'formats': formats,
2497 })
2498 return entries
2499
2500 def _extract_mpd_formats(self, *args, **kwargs):
2501 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2502 if subs:
2503 self.report_warning(bug_reports_message(
2504 "Ignoring subtitle tracks found in the DASH manifest; "
2505 "if any subtitle tracks are missing,"
2506 ))
2507 return fmts
2508
2509 def _extract_mpd_formats_and_subtitles(
2510 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2511 fatal=True, data=None, headers={}, query={}):
2512 res = self._download_xml_handle(
2513 mpd_url, video_id,
2514 note='Downloading MPD manifest' if note is None else note,
2515 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2516 fatal=fatal, data=data, headers=headers, query=query)
2517 if res is False:
2518 return [], {}
2519 mpd_doc, urlh = res
2520 if mpd_doc is None:
2521 return [], {}
2522 mpd_base_url = base_url(urlh.geturl())
2523
2524 return self._parse_mpd_formats_and_subtitles(
2525 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2526
2527 def _parse_mpd_formats(self, *args, **kwargs):
2528 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2529 if subs:
2530 self.report_warning(bug_reports_message(
2531 "Ignoring subtitle tracks found in the DASH manifest; "
2532 "if any subtitle tracks are missing,"
2533 ))
2534 return fmts
2535
2536 def _parse_mpd_formats_and_subtitles(
2537 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2538 """
2539 Parse formats from MPD manifest.
2540 References:
2541 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2542 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2543 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2544 """
2545 if not self.get_param('dynamic_mpd', True):
2546 if mpd_doc.get('type') == 'dynamic':
2547 return [], {}
2548
2549 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2550
2551 def _add_ns(path):
2552 return self._xpath_ns(path, namespace)
2553
2554 def is_drm_protected(element):
2555 return element.find(_add_ns('ContentProtection')) is not None
2556
2557 def extract_multisegment_info(element, ms_parent_info):
2558 ms_info = ms_parent_info.copy()
2559
2560 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2561 # common attributes and elements. We will only extract the ones
2562 # relevant to us.
2563 def extract_common(source):
2564 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2565 if segment_timeline is not None:
2566 s_e = segment_timeline.findall(_add_ns('S'))
2567 if s_e:
2568 ms_info['total_number'] = 0
2569 ms_info['s'] = []
2570 for s in s_e:
2571 r = int(s.get('r', 0))
2572 ms_info['total_number'] += 1 + r
2573 ms_info['s'].append({
2574 't': int(s.get('t', 0)),
2575 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2576 'd': int(s.attrib['d']),
2577 'r': r,
2578 })
2579 start_number = source.get('startNumber')
2580 if start_number:
2581 ms_info['start_number'] = int(start_number)
2582 timescale = source.get('timescale')
2583 if timescale:
2584 ms_info['timescale'] = int(timescale)
2585 segment_duration = source.get('duration')
2586 if segment_duration:
2587 ms_info['segment_duration'] = float(segment_duration)
2588
2589 def extract_Initialization(source):
2590 initialization = source.find(_add_ns('Initialization'))
2591 if initialization is not None:
2592 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2593
2594 segment_list = element.find(_add_ns('SegmentList'))
2595 if segment_list is not None:
2596 extract_common(segment_list)
2597 extract_Initialization(segment_list)
2598 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2599 if segment_urls_e:
2600 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2601 else:
2602 segment_template = element.find(_add_ns('SegmentTemplate'))
2603 if segment_template is not None:
2604 extract_common(segment_template)
2605 media = segment_template.get('media')
2606 if media:
2607 ms_info['media'] = media
2608 initialization = segment_template.get('initialization')
2609 if initialization:
2610 ms_info['initialization'] = initialization
2611 else:
2612 extract_Initialization(segment_template)
2613 return ms_info
2614
2615 skip_unplayable = not self.get_param('allow_unplayable_formats')
2616
2617 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2618 formats = []
2619 subtitles = {}
2620 for period in mpd_doc.findall(_add_ns('Period')):
2621 period_duration = parse_duration(period.get('duration')) or mpd_duration
2622 period_ms_info = extract_multisegment_info(period, {
2623 'start_number': 1,
2624 'timescale': 1,
2625 })
2626 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2627 if skip_unplayable and is_drm_protected(adaptation_set):
2628 continue
2629 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2630 for representation in adaptation_set.findall(_add_ns('Representation')):
2631 if skip_unplayable and is_drm_protected(representation):
2632 continue
2633 representation_attrib = adaptation_set.attrib.copy()
2634 representation_attrib.update(representation.attrib)
2635 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2636 mime_type = representation_attrib['mimeType']
2637 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2638
2639 if content_type in ('video', 'audio', 'text'):
2640 base_url = ''
2641 for element in (representation, adaptation_set, period, mpd_doc):
2642 base_url_e = element.find(_add_ns('BaseURL'))
2643 if base_url_e is not None:
2644 base_url = base_url_e.text + base_url
2645 if re.match(r'^https?://', base_url):
2646 break
2647 if mpd_base_url and not re.match(r'^https?://', base_url):
2648 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2649 mpd_base_url += '/'
2650 base_url = mpd_base_url + base_url
2651 representation_id = representation_attrib.get('id')
2652 lang = representation_attrib.get('lang')
2653 url_el = representation.find(_add_ns('BaseURL'))
2654 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2655 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2656 if content_type in ('video', 'audio'):
2657 f = {
2658 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2659 'manifest_url': mpd_url,
2660 'ext': mimetype2ext(mime_type),
2661 'width': int_or_none(representation_attrib.get('width')),
2662 'height': int_or_none(representation_attrib.get('height')),
2663 'tbr': float_or_none(bandwidth, 1000),
2664 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2665 'fps': int_or_none(representation_attrib.get('frameRate')),
2666 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2667 'format_note': 'DASH %s' % content_type,
2668 'filesize': filesize,
2669 'container': mimetype2ext(mime_type) + '_dash',
2670 }
2671 f.update(parse_codecs(representation_attrib.get('codecs')))
2672 elif content_type == 'text':
2673 f = {
2674 'ext': mimetype2ext(mime_type),
2675 'manifest_url': mpd_url,
2676 'filesize': filesize,
2677 }
2678 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2679
2680 def prepare_template(template_name, identifiers):
2681 tmpl = representation_ms_info[template_name]
2682 # First off, % characters outside $...$ templates
2683 # must be escaped by doubling for proper processing
2684 # by the % operator string formatting used further below (see
2685 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2686 t = ''
2687 in_template = False
2688 for c in tmpl:
2689 t += c
2690 if c == '$':
2691 in_template = not in_template
2692 elif c == '%' and not in_template:
2693 t += c
2694 # Next, $...$ templates are translated to their
2695 # %(...) counterparts to be used with % operator
2696 t = t.replace('$RepresentationID$', representation_id)
2697 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2698 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2699 t = t.replace('$$', '$')
2700 return t
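# Illustrative expansion (made-up template): with identifiers
# ('Number', 'Bandwidth', 'Time') and representation_id 'video=1',
#   '$RepresentationID$/seg-$Number%05d$.m4s'
# becomes
#   'video=1/seg-%(Number)05d.m4s'
# ready for %-formatting with the fragment's Number below.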
2701
2702 # @initialization is a regular template like the @media one,
2703 # so it should be handled in just the same way (see
2704 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2705 if 'initialization' in representation_ms_info:
2706 initialization_template = prepare_template(
2707 'initialization',
2708 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2709 # $Time$ shall not be included for @initialization thus
2710 # only $Bandwidth$ remains
2711 ('Bandwidth', ))
2712 representation_ms_info['initialization_url'] = initialization_template % {
2713 'Bandwidth': bandwidth,
2714 }
2715
2716 def location_key(location):
2717 return 'url' if re.match(r'^https?://', location) else 'path'
2718
2719 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2720
2721 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2722 media_location_key = location_key(media_template)
2723
2724 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2725 # can't be used at the same time
2726 if '%(Number' in media_template and 's' not in representation_ms_info:
2727 segment_duration = None
2728 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2729 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2730 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2731 representation_ms_info['fragments'] = [{
2732 media_location_key: media_template % {
2733 'Number': segment_number,
2734 'Bandwidth': bandwidth,
2735 },
2736 'duration': segment_duration,
2737 } for segment_number in range(
2738 representation_ms_info['start_number'],
2739 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2740 else:
2741 # $Number*$ or $Time$ in media template with S list available
2742 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2743 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
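# Illustrative expansion (made-up values): an S list such as
#   [{'t': 0, 'd': 4000, 'r': 1}, {'t': 8000, 'd': 2000, 'r': 0}]
# with timescale 1000 yields fragments at Time 0, 4000 and 8000,
# lasting 4s, 4s and 2s respectively.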
2744 representation_ms_info['fragments'] = []
2745 segment_time = 0
2746 segment_d = None
2747 segment_number = representation_ms_info['start_number']
2748
2749 def add_segment_url():
2750 segment_url = media_template % {
2751 'Time': segment_time,
2752 'Bandwidth': bandwidth,
2753 'Number': segment_number,
2754 }
2755 representation_ms_info['fragments'].append({
2756 media_location_key: segment_url,
2757 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2758 })
2759
2760 for num, s in enumerate(representation_ms_info['s']):
2761 segment_time = s.get('t') or segment_time
2762 segment_d = s['d']
2763 add_segment_url()
2764 segment_number += 1
2765 for r in range(s.get('r', 0)):
2766 segment_time += segment_d
2767 add_segment_url()
2768 segment_number += 1
2769 segment_time += segment_d
2770 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2771 # No media template
2772 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2773 # or any YouTube dashsegments video
2774 fragments = []
2775 segment_index = 0
2776 timescale = representation_ms_info['timescale']
2777 for s in representation_ms_info['s']:
2778 duration = float_or_none(s['d'], timescale)
2779 for r in range(s.get('r', 0) + 1):
2780 segment_uri = representation_ms_info['segment_urls'][segment_index]
2781 fragments.append({
2782 location_key(segment_uri): segment_uri,
2783 'duration': duration,
2784 })
2785 segment_index += 1
2786 representation_ms_info['fragments'] = fragments
2787 elif 'segment_urls' in representation_ms_info:
2788 # Segment URLs with no SegmentTimeline
2789 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2790 # https://github.com/ytdl-org/youtube-dl/pull/14844
2791 fragments = []
2792 segment_duration = float_or_none(
2793 representation_ms_info['segment_duration'],
2794 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2795 for segment_url in representation_ms_info['segment_urls']:
2796 fragment = {
2797 location_key(segment_url): segment_url,
2798 }
2799 if segment_duration:
2800 fragment['duration'] = segment_duration
2801 fragments.append(fragment)
2802 representation_ms_info['fragments'] = fragments
2803 # If a fragments key is available, then we correctly recognized fragmented media.
2804 # Otherwise we assume unfragmented media with direct access. Technically, this
2805 # assumption is not necessarily correct, since we may simply not support some
2806 # forms of fragmented media renditions yet, but for now we'll use this fallback.
2807 if 'fragments' in representation_ms_info:
2808 f.update({
2809 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2810 'url': mpd_url or base_url,
2811 'fragment_base_url': base_url,
2812 'fragments': [],
2813 'protocol': 'http_dash_segments',
2814 })
2815 if 'initialization_url' in representation_ms_info:
2816 initialization_url = representation_ms_info['initialization_url']
2817 if not f.get('url'):
2818 f['url'] = initialization_url
2819 f['fragments'].append({location_key(initialization_url): initialization_url})
2820 f['fragments'].extend(representation_ms_info['fragments'])
2821 else:
2822 # Assuming direct URL to unfragmented media.
2823 f['url'] = base_url
2824 if content_type in ('video', 'audio'):
2825 formats.append(f)
2826 elif content_type == 'text':
2827 subtitles.setdefault(lang or 'und', []).append(f)
2828 else:
2829 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2830 return formats, subtitles
2831
2832 def _extract_ism_formats(self, *args, **kwargs):
2833 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2834 if subs:
2835 self.report_warning(bug_reports_message(
2836 "Ignoring subtitle tracks found in the ISM manifest; "
2837 "if any subtitle tracks are missing,"
2838 ))
2839 return fmts
2840
2841 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2842 res = self._download_xml_handle(
2843 ism_url, video_id,
2844 note='Downloading ISM manifest' if note is None else note,
2845 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2846 fatal=fatal, data=data, headers=headers, query=query)
2847 if res is False:
2848 return [], {}
2849 ism_doc, urlh = res
2850 if ism_doc is None:
2851 return [], {}
2852
2853 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2854
2855 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2856 """
2857 Parse formats from ISM manifest.
2858 References:
2859 1. [MS-SSTR]: Smooth Streaming Protocol,
2860 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2861 """
2862 if ism_doc.get('IsLive') == 'TRUE':
2863 return [], {}
2864 if (not self.get_param('allow_unplayable_formats')
2865 and ism_doc.find('Protection') is not None):
2866 return [], {}
2867
2868 duration = int(ism_doc.attrib['Duration'])
2869 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
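# e.g. Duration="1234500000" with the default TimeScale of 10000000
# (100ns ticks) corresponds to 123.45 seconds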
2870
2871 formats = []
2872 subtitles = {}
2873 for stream in ism_doc.findall('StreamIndex'):
2874 stream_type = stream.get('Type')
2875 if stream_type not in ('video', 'audio', 'text'):
2876 continue
2877 url_pattern = stream.attrib['Url']
2878 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2879 stream_name = stream.get('Name')
2880 stream_language = stream.get('Language', 'und')
2881 for track in stream.findall('QualityLevel'):
2882 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2883 # TODO: add support for WVC1 and WMAP
2884 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2885 self.report_warning('%s is not a supported codec' % fourcc)
2886 continue
2887 tbr = int(track.attrib['Bitrate']) // 1000
2888 # [1] does not mention Width and Height attributes. However,
2889 # they're often present while MaxWidth and MaxHeight are
2890 # missing, so they should be used as fallbacks
2891 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2892 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2893 sampling_rate = int_or_none(track.get('SamplingRate'))
2894
2895 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2896 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2897
2898 fragments = []
2899 fragment_ctx = {
2900 'time': 0,
2901 }
2902 stream_fragments = stream.findall('c')
2903 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2904 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2905 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2906 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2907 if not fragment_ctx['duration']:
2908 try:
2909 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2910 except IndexError:
2911 next_fragment_time = duration
2912 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2913 for _ in range(fragment_repeat):
2914 fragments.append({
2915 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2916 'duration': fragment_ctx['duration'] / stream_timescale,
2917 })
2918 fragment_ctx['time'] += fragment_ctx['duration']
2919
2920 format_id = []
2921 if ism_id:
2922 format_id.append(ism_id)
2923 if stream_name:
2924 format_id.append(stream_name)
2925 format_id.append(compat_str(tbr))
2926
2927 if stream_type == 'text':
2928 subtitles.setdefault(stream_language, []).append({
2929 'ext': 'ismt',
2930 'protocol': 'ism',
2931 'url': ism_url,
2932 'manifest_url': ism_url,
2933 'fragments': fragments,
2934 '_download_params': {
2935 'stream_type': stream_type,
2936 'duration': duration,
2937 'timescale': stream_timescale,
2938 'fourcc': fourcc,
2939 'language': stream_language,
2940 'codec_private_data': track.get('CodecPrivateData'),
2941 }
2942 })
2943 elif stream_type in ('video', 'audio'):
2944 formats.append({
2945 'format_id': '-'.join(format_id),
2946 'url': ism_url,
2947 'manifest_url': ism_url,
2948 'ext': 'ismv' if stream_type == 'video' else 'isma',
2949 'width': width,
2950 'height': height,
2951 'tbr': tbr,
2952 'asr': sampling_rate,
2953 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2954 'acodec': 'none' if stream_type == 'video' else fourcc,
2955 'protocol': 'ism',
2956 'fragments': fragments,
2957 '_download_params': {
2958 'stream_type': stream_type,
2959 'duration': duration,
2960 'timescale': stream_timescale,
2961 'width': width or 0,
2962 'height': height or 0,
2963 'fourcc': fourcc,
2964 'language': stream_language,
2965 'codec_private_data': track.get('CodecPrivateData'),
2966 'sampling_rate': sampling_rate,
2967 'channels': int_or_none(track.get('Channels', 2)),
2968 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2969 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2970 },
2971 })
2972 return formats, subtitles
2973
2974 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2975 def absolute_url(item_url):
2976 return urljoin(base_url, item_url)
2977
2978 def parse_content_type(content_type):
2979 if not content_type:
2980 return {}
2981 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2982 if ctr:
2983 mimetype, codecs = ctr.groups()
2984 f = parse_codecs(codecs)
2985 f['ext'] = mimetype2ext(mimetype)
2986 return f
2987 return {}
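# As an illustration (hypothetical input), parse_content_type above, given
#   'video/mp4; codecs="avc1.4d401e, mp4a.40.2"'
# would yield roughly {'ext': 'mp4', 'vcodec': 'avc1.4d401e',
# 'acodec': 'mp4a.40.2'}, with the exact keys determined by parse_codecs.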
2988
2989 def _media_formats(src, cur_media_type, type_info={}):
2990 full_url = absolute_url(src)
2991 ext = type_info.get('ext') or determine_ext(full_url)
2992 if ext == 'm3u8':
2993 is_plain_url = False
2994 formats = self._extract_m3u8_formats(
2995 full_url, video_id, ext='mp4',
2996 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2997 preference=preference, quality=quality, fatal=False)
2998 elif ext == 'mpd':
2999 is_plain_url = False
3000 formats = self._extract_mpd_formats(
3001 full_url, video_id, mpd_id=mpd_id, fatal=False)
3002 else:
3003 is_plain_url = True
3004 formats = [{
3005 'url': full_url,
3006 'vcodec': 'none' if cur_media_type == 'audio' else None,
3007 }]
3008 return is_plain_url, formats
3009
3010 entries = []
3011 # amp-video and amp-audio are very similar to their HTML5 counterparts
3012 # so we will include them right here (see
3013 # https://www.ampproject.org/docs/reference/components/amp-video)
3014 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3015 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3016 media_tags = [(media_tag, media_tag_name, media_type, '')
3017 for media_tag, media_tag_name, media_type
3018 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3019 media_tags.extend(re.findall(
3020 # We only allow video|audio followed by whitespace or '>'.
3021 # Allowing more characters may end up in significant slow down (see
3022 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3023 # http://www.porntrex.com/maps/videositemap.xml).
3024 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3025 for media_tag, _, media_type, media_content in media_tags:
3026 media_info = {
3027 'formats': [],
3028 'subtitles': {},
3029 }
3030 media_attributes = extract_attributes(media_tag)
3031 src = strip_or_none(media_attributes.get('src'))
3032 if src:
3033 _, formats = _media_formats(src, media_type)
3034 media_info['formats'].extend(formats)
3035 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3036 if media_content:
3037 for source_tag in re.findall(r'<source[^>]+>', media_content):
3038 s_attr = extract_attributes(source_tag)
3039 # data-video-src and data-src are non-standard but seen
3040 # several times in the wild
3041 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3042 if not src:
3043 continue
3044 f = parse_content_type(s_attr.get('type'))
3045 is_plain_url, formats = _media_formats(src, media_type, f)
3046 if is_plain_url:
3047 # width, height, res, label and title attributes are
3048 # all non-standard but seen several times in the wild
3049 labels = [
3050 s_attr.get(lbl)
3051 for lbl in ('label', 'title')
3052 if str_or_none(s_attr.get(lbl))
3053 ]
3054 width = int_or_none(s_attr.get('width'))
3055 height = (int_or_none(s_attr.get('height'))
3056 or int_or_none(s_attr.get('res')))
3057 if not width or not height:
3058 for lbl in labels:
3059 resolution = parse_resolution(lbl)
3060 if not resolution:
3061 continue
3062 width = width or resolution.get('width')
3063 height = height or resolution.get('height')
3064 for lbl in labels:
3065 tbr = parse_bitrate(lbl)
3066 if tbr:
3067 break
3068 else:
3069 tbr = None
3070 f.update({
3071 'width': width,
3072 'height': height,
3073 'tbr': tbr,
3074 'format_id': s_attr.get('label') or s_attr.get('title'),
3075 })
3076 f.update(formats[0])
3077 media_info['formats'].append(f)
3078 else:
3079 media_info['formats'].extend(formats)
3080 for track_tag in re.findall(r'<track[^>]+>', media_content):
3081 track_attributes = extract_attributes(track_tag)
3082 kind = track_attributes.get('kind')
3083 if not kind or kind in ('subtitles', 'captions'):
3084 src = strip_or_none(track_attributes.get('src'))
3085 if not src:
3086 continue
3087 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3088 media_info['subtitles'].setdefault(lang, []).append({
3089 'url': absolute_url(src),
3090 })
3091 for f in media_info['formats']:
3092 f.setdefault('http_headers', {})['Referer'] = base_url
3093 if media_info['formats'] or media_info['subtitles']:
3094 entries.append(media_info)
3095 return entries
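# A minimal usage sketch for _parse_html5_media_entries from an extractor,
# assuming `webpage` and `url` have already been fetched by the caller:
#
#   entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
#   if entries:
#       info = entries[0]
#       self._sort_formats(info['formats'])
#       info['id'], info['title'] = video_id, title
#       return info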
3096
3097 def _extract_akamai_formats(self, *args, **kwargs):
3098 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3099 if subs:
3100 self.report_warning(bug_reports_message(
3101 "Ignoring subtitle tracks found in the manifests; "
3102 "if any subtitle tracks are missing,"
3103 ))
3104 return fmts
3105
3106 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3107 signed = 'hdnea=' in manifest_url
3108 if not signed:
3109 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3110 manifest_url = re.sub(
3111 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3112 '', manifest_url).strip('?')
3113
3114 formats = []
3115 subtitles = {}
3116
3117 hdcore_sign = 'hdcore=3.7.0'
3118 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3119 hds_host = hosts.get('hds')
3120 if hds_host:
3121 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3122 if 'hdcore=' not in f4m_url:
3123 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3124 f4m_formats = self._extract_f4m_formats(
3125 f4m_url, video_id, f4m_id='hds', fatal=False)
3126 for entry in f4m_formats:
3127 entry.update({'extra_param_to_segment_url': hdcore_sign})
3128 formats.extend(f4m_formats)
3129
3130 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3131 hls_host = hosts.get('hls')
3132 if hls_host:
3133 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3134 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3135 m3u8_url, video_id, 'mp4', 'm3u8_native',
3136 m3u8_id='hls', fatal=False)
3137 formats.extend(m3u8_formats)
3138 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3139
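# Derive plain-HTTP progressive formats from the HLS renditions. For an
# Akamai URL of the typical shape (hypothetical example)
#   https://example-vh.akamaihd.net/i/videos/clip_,300,600,900,.mp4.csmil/master.m3u8
# the second REPL_REGEX group yields the qualities ('300', '600', '900'),
# and each HLS variant is rewritten below to a direct file URL such as
#   https://<http_host>/videos/clip_300.mp4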
3140 http_host = hosts.get('http')
3141 if http_host and m3u8_formats and not signed:
3142 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3143 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3144 qualities_length = len(qualities)
3145 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3146 i = 0
3147 for f in m3u8_formats:
3148 if f['vcodec'] != 'none':
3149 for protocol in ('http', 'https'):
3150 http_f = f.copy()
3151 del http_f['manifest_url']
3152 http_url = re.sub(
3153 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3154 http_f.update({
3155 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3156 'url': http_url,
3157 'protocol': protocol,
3158 })
3159 formats.append(http_f)
3160 i += 1
3161
3162 return formats, subtitles
3163
3164 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3165 query = compat_urlparse.urlparse(url).query
3166 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3167 mobj = re.search(
3168 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3169 url_base = mobj.group('url')
3170 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3171 formats = []
3172
3173 def manifest_url(manifest):
3174 m_url = '%s/%s' % (http_base_url, manifest)
3175 if query:
3176 m_url += '?%s' % query
3177 return m_url
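# E.g. with a (hypothetical) url_base of '//example.com/vod/mp4:sample.mp4',
# the candidate manifests probed via manifest_url below would be
#   http://example.com/vod/mp4:sample.mp4/playlist.m3u8
#   http://example.com/vod/mp4:sample.mp4/manifest.f4m
#   http://example.com/vod/mp4:sample.mp4/manifest.mpd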
3178
3179 if 'm3u8' not in skip_protocols:
3180 formats.extend(self._extract_m3u8_formats(
3181 manifest_url('playlist.m3u8'), video_id, 'mp4',
3182 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3183 if 'f4m' not in skip_protocols:
3184 formats.extend(self._extract_f4m_formats(
3185 manifest_url('manifest.f4m'),
3186 video_id, f4m_id='hds', fatal=False))
3187 if 'dash' not in skip_protocols:
3188 formats.extend(self._extract_mpd_formats(
3189 manifest_url('manifest.mpd'),
3190 video_id, mpd_id='dash', fatal=False))
3191 if re.search(r'(?:/smil:|\.smil)', url_base):
3192 if 'smil' not in skip_protocols:
3193 rtmp_formats = self._extract_smil_formats(
3194 manifest_url('jwplayer.smil'),
3195 video_id, fatal=False)
3196 for rtmp_format in rtmp_formats:
3197 rtsp_format = rtmp_format.copy()
3198 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3199 del rtsp_format['play_path']
3200 del rtsp_format['ext']
3201 rtsp_format.update({
3202 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3203 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3204 'protocol': 'rtsp',
3205 })
3206 formats.extend([rtmp_format, rtsp_format])
3207 else:
3208 for protocol in ('rtmp', 'rtsp'):
3209 if protocol not in skip_protocols:
3210 formats.append({
3211 'url': '%s:%s' % (protocol, url_base),
3212 'format_id': protocol,
3213 'protocol': protocol,
3214 })
3215 return formats
3216
3217 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
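# Matches inline embeds of the form (illustrative sketch):
#   jwplayer("myplayer").setup({"playlist": [{"sources": [...]}]});
# and parses the setup options as JSON (after transform_source).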
3218 mobj = re.search(
3219 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3220 webpage)
3221 if mobj:
3222 try:
3223 jwplayer_data = self._parse_json(mobj.group('options'),
3224 video_id=video_id,
3225 transform_source=transform_source)
3226 except ExtractorError:
3227 pass
3228 else:
3229 if isinstance(jwplayer_data, dict):
3230 return jwplayer_data
3231
3232 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3233 jwplayer_data = self._find_jwplayer_data(
3234 webpage, video_id, transform_source=js_to_json)
3235 return self._parse_jwplayer_data(
3236 jwplayer_data, video_id, *args, **kwargs)
3237
3238 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3239 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3240 # JWPlayer backward compatibility: flattened playlists
3241 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3242 if 'playlist' not in jwplayer_data:
3243 jwplayer_data = {'playlist': [jwplayer_data]}
3244
3245 entries = []
3246
3247 # JWPlayer backward compatibility: single playlist item
3248 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3249 if not isinstance(jwplayer_data['playlist'], list):
3250 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3251
3252 for video_data in jwplayer_data['playlist']:
3253 # JWPlayer backward compatibility: flattened sources
3254 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3255 if 'sources' not in video_data:
3256 video_data['sources'] = [video_data]
3257
3258 this_video_id = video_id or video_data['mediaid']
3259
3260 formats = self._parse_jwplayer_formats(
3261 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3262 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3263
3264 subtitles = {}
3265 tracks = video_data.get('tracks')
3266 if tracks and isinstance(tracks, list):
3267 for track in tracks:
3268 if not isinstance(track, dict):
3269 continue
3270 track_kind = track.get('kind')
3271 if not track_kind or not isinstance(track_kind, compat_str):
3272 continue
3273 if track_kind.lower() not in ('captions', 'subtitles'):
3274 continue
3275 track_url = urljoin(base_url, track.get('file'))
3276 if not track_url:
3277 continue
3278 subtitles.setdefault(track.get('label') or 'en', []).append({
3279 'url': self._proto_relative_url(track_url)
3280 })
3281
3282 entry = {
3283 'id': this_video_id,
3284 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3285 'description': clean_html(video_data.get('description')),
3286 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3287 'timestamp': int_or_none(video_data.get('pubdate')),
3288 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3289 'subtitles': subtitles,
3290 }
3291 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3292 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3293 entry.update({
3294 '_type': 'url_transparent',
3295 'url': formats[0]['url'],
3296 })
3297 else:
3298 self._sort_formats(formats)
3299 entry['formats'] = formats
3300 entries.append(entry)
3301 if len(entries) == 1:
3302 return entries[0]
3303 else:
3304 return self.playlist_result(entries)
3305
3306 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3307 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3308 urls = []
3309 formats = []
3310 for source in jwplayer_sources_data:
3311 if not isinstance(source, dict):
3312 continue
3313 source_url = urljoin(
3314 base_url, self._proto_relative_url(source.get('file')))
3315 if not source_url or source_url in urls:
3316 continue
3317 urls.append(source_url)
3318 source_type = source.get('type') or ''
3319 ext = mimetype2ext(source_type) or determine_ext(source_url)
3320 if source_type == 'hls' or ext == 'm3u8':
3321 formats.extend(self._extract_m3u8_formats(
3322 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3323 m3u8_id=m3u8_id, fatal=False))
3324 elif source_type == 'dash' or ext == 'mpd':
3325 formats.extend(self._extract_mpd_formats(
3326 source_url, video_id, mpd_id=mpd_id, fatal=False))
3327 elif ext == 'smil':
3328 formats.extend(self._extract_smil_formats(
3329 source_url, video_id, fatal=False))
3330 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3331 elif source_type.startswith('audio') or ext in (
3332 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3333 formats.append({
3334 'url': source_url,
3335 'vcodec': 'none',
3336 'ext': ext,
3337 })
3338 else:
3339 height = int_or_none(source.get('height'))
3340 if height is None:
3341 # Often no height is provided but there is a label in a
3342 # format like "1080p", "720p SD", or 1080.
3343 height = int_or_none(self._search_regex(
3344 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3345 'height', default=None))
3346 a_format = {
3347 'url': source_url,
3348 'width': int_or_none(source.get('width')),
3349 'height': height,
3350 'tbr': int_or_none(source.get('bitrate')),
3351 'ext': ext,
3352 }
3353 if source_url.startswith('rtmp'):
3354 a_format['ext'] = 'flv'
3355 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3356 # of jwplayer.flash.swf
3357 rtmp_url_parts = re.split(
3358 r'((?:mp4|mp3|flv):)', source_url, 1)
3359 if len(rtmp_url_parts) == 3:
3360 rtmp_url, prefix, play_path = rtmp_url_parts
3361 a_format.update({
3362 'url': rtmp_url,
3363 'play_path': prefix + play_path,
3364 })
3365 if rtmp_params:
3366 a_format.update(rtmp_params)
3367 formats.append(a_format)
3368 return formats
3369
3370 def _live_title(self, name):
3371 """ Generate the title for a live video """
3372 now = datetime.datetime.now()
3373 now_str = now.strftime('%Y-%m-%d %H:%M')
3374 return name + ' ' + now_str
3375
3376 def _int(self, v, name, fatal=False, **kwargs):
3377 res = int_or_none(v, **kwargs)
3380 if res is None:
3381 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3382 if fatal:
3383 raise ExtractorError(msg)
3384 else:
3385 self.report_warning(msg)
3386 return res
3387
3388 def _float(self, v, name, fatal=False, **kwargs):
3389 res = float_or_none(v, **kwargs)
3390 if res is None:
3391 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3392 if fatal:
3393 raise ExtractorError(msg)
3394 else:
3395 self.report_warning(msg)
3396 return res
3397
3398 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3399 path='/', secure=False, discard=False, rest={}, **kwargs):
3400 cookie = compat_cookiejar_Cookie(
3401 0, name, value, port, port is not None, domain, True,
3402 domain.startswith('.'), path, True, secure, expire_time,
3403 discard, None, None, rest)
3404 self._downloader.cookiejar.set_cookie(cookie)
3405
3406 def _get_cookies(self, url):
3407 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3408 req = sanitized_Request(url)
3409 self._downloader.cookiejar.add_cookie_header(req)
3410 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3411
3412 def _apply_first_set_cookie_header(self, url_handle, cookie):
3413 """
3414 Apply the first Set-Cookie header instead of the last. Experimental.
3415
3416 Some sites (e.g. [1-3]) may serve two cookies under the same name
3417 in the Set-Cookie header and expect the first (old) one to be set
3418 rather than the second (new) one. However, per RFC 6265 the newer
3419 cookie is the one that should end up in the cookie store, and that
3420 is what actually happens. We work around this issue by resetting
3421 the cookie to the first one manually.
3422 1. https://new.vk.com/
3423 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3424 3. https://learning.oreilly.com/
3425 """
3426 for header, cookies in url_handle.headers.items():
3427 if header.lower() != 'set-cookie':
3428 continue
3429 if sys.version_info[0] >= 3:
3430 cookies = cookies.encode('iso-8859-1')
3431 cookies = cookies.decode('utf-8')
3432 cookie_value = re.search(
3433 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3434 if cookie_value:
3435 value, domain = cookie_value.groups()
3436 self._set_cookie(domain, cookie, value)
3437 break
3438
3439 def get_testcases(self, include_onlymatching=False):
3440 t = getattr(self, '_TEST', None)
3441 if t:
3442 assert not hasattr(self, '_TESTS'), \
3443 '%s has _TEST and _TESTS' % type(self).__name__
3444 tests = [t]
3445 else:
3446 tests = getattr(self, '_TESTS', [])
3447 for t in tests:
3448 if not include_onlymatching and t.get('only_matching', False):
3449 continue
3450 t['name'] = type(self).__name__[:-len('IE')]
3451 yield t
3452
3453 def is_suitable(self, age_limit):
3454 """ Test whether the extractor is generally suitable for the given
3455 age limit (i.e. pornographic sites are not, all others usually are) """
3456
3457 any_restricted = False
3458 for tc in self.get_testcases(include_onlymatching=False):
3459 if tc.get('playlist', []):
3460 tc = tc['playlist'][0]
3461 is_restricted = age_restricted(
3462 tc.get('info_dict', {}).get('age_limit'), age_limit)
3463 if not is_restricted:
3464 return True
3465 any_restricted = any_restricted or is_restricted
3466 return not any_restricted
3467
3468 def extract_subtitles(self, *args, **kwargs):
3469 if (self.get_param('writesubtitles', False)
3470 or self.get_param('listsubtitles')):
3471 return self._get_subtitles(*args, **kwargs)
3472 return {}
3473
3474 def _get_subtitles(self, *args, **kwargs):
3475 raise NotImplementedError('This method must be implemented by subclasses')
3476
3477 @staticmethod
3478 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3479 """ Merge subtitle items for one language. Items with duplicated URLs
3480 will be dropped. """
3481 list1_urls = set(item['url'] for item in subtitle_list1)
3482 ret = list(subtitle_list1)
3483 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3484 return ret
3485
3486 @classmethod
3487 def _merge_subtitles(cls, *dicts, **kwargs):
3488 """ Merge subtitle dictionaries, language by language. """
3489
3490 target = (lambda target=None: target)(**kwargs)
3491 # The above lambda extracts the keyword argument 'target' from kwargs
3492 # while ensuring there are no stray ones. When Python 2 support
3493 # is dropped, remove it and change the function signature to:
3494 #
3495 # def _merge_subtitles(cls, *dicts, target=None):
3496
3497 if target is None:
3498 target = {}
3499 for d in dicts:
3500 for lang, subs in d.items():
3501 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3502 return target
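# Usage sketch with hypothetical inputs:
#   self._merge_subtitles({'en': [{'url': url_a}]}, {'en': [{'url': url_b}]})
#   -> {'en': [{'url': url_a}, {'url': url_b}]} (duplicate URLs are dropped)
# Pass target=some_dict to merge into an existing dictionary in place.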
3503
3504 def extract_automatic_captions(self, *args, **kwargs):
3505 if (self.get_param('writeautomaticsub', False)
3506 or self.get_param('listsubtitles')):
3507 return self._get_automatic_captions(*args, **kwargs)
3508 return {}
3509
3510 def _get_automatic_captions(self, *args, **kwargs):
3511 raise NotImplementedError('This method must be implemented by subclasses')
3512
3513 def mark_watched(self, *args, **kwargs):
3514 if (self.get_param('mark_watched', False)
3515 and (self._get_login_info()[0] is not None
3516 or self.get_param('cookiefile') is not None)):
3517 self._mark_watched(*args, **kwargs)
3518
3519 def _mark_watched(self, *args, **kwargs):
3520 raise NotImplementedError('This method must be implemented by subclasses')
3521
3522 def geo_verification_headers(self):
3523 headers = {}
3524 geo_verification_proxy = self.get_param('geo_verification_proxy')
3525 if geo_verification_proxy:
3526 headers['Ytdl-request-proxy'] = geo_verification_proxy
3527 return headers
3528
3529 def _generic_id(self, url):
3530 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3531
3532 def _generic_title(self, url):
3533 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
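# Both helpers reduce a URL to its unquoted basename without the extension,
# e.g. (hypothetical URL) 'https://example.com/media/some%20clip.mp4'
# becomes 'some clip'.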
3534
3535 @staticmethod
3536 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3537 all_known = all(map(
3538 lambda x: x is not None,
3539 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3540 return (
3541 'private' if is_private
3542 else 'premium_only' if needs_premium
3543 else 'subscriber_only' if needs_subscription
3544 else 'needs_auth' if needs_auth
3545 else 'unlisted' if is_unlisted
3546 else 'public' if all_known
3547 else None)
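# For instance (hypothetical call):
#   self._availability(is_private=False, needs_premium=False,
#                      needs_subscription=False, needs_auth=False,
#                      is_unlisted=False)
# returns 'public', while leaving any flag as None returns None instead,
# since 'public' cannot be asserted without knowing all five.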
3548
3549
3550 class SearchInfoExtractor(InfoExtractor):
3551 """
3552 Base class for paged search query extractors.
3553 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3554 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3555 """
3556
3557 @classmethod
3558 def _make_valid_url(cls):
3559 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3560
3561 @classmethod
3562 def suitable(cls, url):
3563 return re.match(cls._make_valid_url(), url) is not None
3564
3565 def _real_extract(self, query):
3566 mobj = re.match(self._make_valid_url(), query)
3567 if mobj is None:
3568 raise ExtractorError('Invalid search query "%s"' % query)
3569
3570 prefix = mobj.group('prefix')
3571 query = mobj.group('query')
3572 if prefix == '':
3573 return self._get_n_results(query, 1)
3574 elif prefix == 'all':
3575 return self._get_n_results(query, self._MAX_RESULTS)
3576 else:
3577 n = int(prefix)
3578 if n <= 0:
3579 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3580 elif n > self._MAX_RESULTS:
3581 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3582 n = self._MAX_RESULTS
3583 return self._get_n_results(query, n)
3584
3585 def _get_n_results(self, query, n):
3586 """Get a specified number of results for a query"""
3587 raise NotImplementedError('This method must be implemented by subclasses')
3588
3589 @property
3590 def SEARCH_KEY(self):
3591 return self._SEARCH_KEY