# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                       - HTTP URL to plain file media (in case of
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist').
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

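    As a purely hypothetical illustration (the id and title are borrowed from
    the field descriptions above; the URL and format values are invented), a
    minimal "video" result could look like:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/media/4234987.mp4',
                'format_id': 'mp4-720p',
                'width': 1280,
                'height': 720,
            }],
        }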

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).
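
    A sketch of such a playlist result, with hypothetical values:

        {
            '_type': 'playlist',
            'id': 'some-playlist-id',
            'title': 'Some playlist title',
            'entries': [video_info_dict1, video_info_dict2],
        }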


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
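
    For instance, a "url" result deferring to another extractor might look
    like this (URL and ie_key are illustrative only):

        {
            '_type': 'url',
            'url': 'https://example.com/embedded/video/123',
            'ie_key': 'Generic',
        }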


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))
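
    # For illustration, with a hypothetical pattern such as
    # _VALID_URL = r'https?://example\.com/video/(?P<id>[0-9]+)',
    # _match_id('https://example.com/video/123') would return '123'.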

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for the initial geo bypass setup
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

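        For example, an extractor that discovers mid-extraction that a video
        is only watchable from Germany could call (country code illustrative):

            self._initialize_geo_bypass({
                'countries': ['DE'],
            })
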
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in a list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether the error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
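
        For example, to accept and parse a page served with HTTP 404 (status
        code chosen purely for illustration):

            webpage = self._download_webpage(
                url, video_id, expected_status=404)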
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
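
    # A typical use from an extractor's _real_extract, with a hypothetical
    # embed URL:
    #   return self.url_result(embed_url, ie='Youtube')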

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value, report a warning, or raise
        a RegexNotFoundError, depending on default and fatal, specifying the
        field name.
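
        For example (pattern and variable names are illustrative only):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)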
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None.
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]
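
    # The two templates above match OpenGraph meta tags written in either
    # attribute order, e.g. (sample markup, not from any particular site):
    #   <meta property="og:title" content="Some title">
    #   <meta content="Some title" property="og:title">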

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'uploader': str_or_none(e.get('author')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
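
        # Each entry of a sort list is parsed by the regex above; judging by
        # add_item() in evaluate_params() below, e.g. 'res:1080' sets a limit
        # for a field, a leading '+' reverses the ordering, and the '~'
        # separator appears to select the value closest to the given limit.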
1390
1391 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
1392 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1393 'proto', 'ext', 'has_audio', 'source', 'format_id') # These must not be aliases
1394
1395 settings = {
1396 'vcodec': {'type': 'ordered', 'regex': True,
1397 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1398 'acodec': {'type': 'ordered', 'regex': True,
1399 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1400 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1401 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
1402 'vext': {'type': 'ordered', 'field': 'video_ext',
1403 'order': ('mp4', 'webm', 'flv', '', 'none'),
1404 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1405 'aext': {'type': 'ordered', 'field': 'audio_ext',
1406 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1407 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1408 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1409 'ie_pref': {'priority': True, 'type': 'extractor'},
1410 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1411 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1412 'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1413 'quality': {'convert': 'float_none', 'type': 'extractor'},
1414 'filesize': {'convert': 'bytes'},
1415 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1416 'id': {'convert': 'string', 'field': 'format_id'},
1417 'height': {'convert': 'float_none'},
1418 'width': {'convert': 'float_none'},
1419 'fps': {'convert': 'float_none'},
1420 'tbr': {'convert': 'float_none'},
1421 'vbr': {'convert': 'float_none'},
1422 'abr': {'convert': 'float_none'},
1423 'asr': {'convert': 'float_none'},
1424 'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
1425
1426 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1427 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1428 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1429 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1430 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1431
1432 # Most of these exist only for compatibility reasons
1433 'dimension': {'type': 'alias', 'field': 'res'},
1434 'resolution': {'type': 'alias', 'field': 'res'},
1435 'extension': {'type': 'alias', 'field': 'ext'},
1436 'bitrate': {'type': 'alias', 'field': 'br'},
1437 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1438 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1439 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1440 'framerate': {'type': 'alias', 'field': 'fps'},
1441 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1442 'protocol': {'type': 'alias', 'field': 'proto'},
1443 'source_preference': {'type': 'alias', 'field': 'source'},
1444 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1445 'filesize_estimate': {'type': 'alias', 'field': 'size'},
1446 'samplerate': {'type': 'alias', 'field': 'asr'},
1447 'video_ext': {'type': 'alias', 'field': 'vext'},
1448 'audio_ext': {'type': 'alias', 'field': 'aext'},
1449 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1450 'audio_codec': {'type': 'alias', 'field': 'acodec'},
1451 'video': {'type': 'alias', 'field': 'hasvid'},
1452 'has_video': {'type': 'alias', 'field': 'hasvid'},
1453 'audio': {'type': 'alias', 'field': 'hasaud'},
1454 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1455 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1456 'preference': {'type': 'alias', 'field': 'ie_pref'},
1457 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1458 'format_id': {'type': 'alias', 'field': 'id'},
1459 }
1460
1461 _order = []
1462
1463 def _get_field_setting(self, field, key):
1464 if field not in self.settings:
1465 self.settings[field] = {}
1466 propObj = self.settings[field]
1467 if key not in propObj:
1468 type = propObj.get('type')
1469 if key == 'field':
1470 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1471 elif key == 'convert':
1472 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1473 else:
1474 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1475 propObj[key] = default
1476 return propObj[key]
1477
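# A minimal sketch of how the lazy defaults above resolve (assuming
# FormatSort is instantiated directly, before any parameters are evaluated):
#
#     fs = InfoExtractor.FormatSort()
#     fs._get_field_setting('height', 'type')     # -> 'field' (default)
#     fs._get_field_setting('height', 'convert')  # -> 'float_none' (explicit)
#     fs._get_field_setting('codec', 'field')     # -> ('vcodec', 'acodec')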
1478 def _resolve_field_value(self, field, value, convertNone=False):
1479 if value is None:
1480 if not convertNone:
1481 return None
1482 else:
1483 value = value.lower()
1484 conversion = self._get_field_setting(field, 'convert')
1485 if conversion == 'ignore':
1486 return None
1487 if conversion == 'string':
1488 return value
1489 elif conversion == 'float_none':
1490 return float_or_none(value)
1491 elif conversion == 'bytes':
1492 return FileDownloader.parse_bytes(value)
1493 elif conversion == 'order':
1494 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1495 use_regex = self._get_field_setting(field, 'regex')
1496 list_length = len(order_list)
1497 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1498 if use_regex and value is not None:
1499 for i, regex in enumerate(order_list):
1500 if regex and re.match(regex, value):
1501 return list_length - i
1502 return list_length - empty_pos # not in list
1503 else: # not regex or value = None
1504 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1505 else: # 'float_string'
1506 try: # unlike str.isnumeric(), this accepts floats and negative numbers (e.g. '29.97')
1507 return float(value)
1508 except ValueError:
1509 self.settings[field]['convert'] = 'string'
1510 return value
1511
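# A sketch of the 'order' conversion above, assuming _use_free_order has
# been set by hand (evaluate_params normally does this):
#
#     fs = InfoExtractor.FormatSort()
#     fs._use_free_order = False
#     fs._resolve_field_value('vcodec', 'vp9')   # -> 9 (matches 'vp0?9', index 2 of 11)
#     fs._resolve_field_value('vcodec', 'h264')  # -> 7 (matches '[hx]264|avc', index 4)
#
# Larger return values rank as better within the field.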
1512 def evaluate_params(self, params, sort_extractor):
1513 self._use_free_order = params.get('prefer_free_formats', False)
1514 self._sort_user = params.get('format_sort', [])
1515 self._sort_extractor = sort_extractor
self._order = []  # per-instance order; appending to the class-level default would leak state between instances
1516
1517 def add_item(field, reverse, closest, limit_text):
1518 field = field.lower()
1519 if field in self._order:
1520 return
1521 self._order.append(field)
1522 limit = self._resolve_field_value(field, limit_text)
1523 data = {
1524 'reverse': reverse,
1525 'closest': False if limit is None else closest,
1526 'limit_text': limit_text,
1527 'limit': limit}
1528 if field in self.settings:
1529 self.settings[field].update(data)
1530 else:
1531 self.settings[field] = data
1532
1533 sort_list = (
1534 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1535 + (tuple() if params.get('format_sort_force', False)
1536 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1537 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1538
1539 for item in sort_list:
1540 match = re.match(self.regex, item)
1541 if match is None:
1542 raise ExtractorError('Invalid format sort string "%s"' % item)  # may come from the user or the extractor
1543 field = match.group('field')
1544 if field is None:
1545 continue
1546 if self._get_field_setting(field, 'type') == 'alias':
1547 field = self._get_field_setting(field, 'field')
1548 reverse = match.group('reverse') is not None
1549 closest = match.group('seperator') == '~'
1550 limit_text = match.group('limit')
1551
1552 has_limit = limit_text is not None
1553 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1554 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1555
1556 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1557 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1558 limit_count = len(limits)
1559 for (i, f) in enumerate(fields):
1560 add_item(f, reverse, closest,
1561 limits[i] if i < limit_count
1562 else limits[0] if has_limit and not has_multiple_limits
1563 else None)
1564
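# Illustrative sort items accepted here (parsed by self.regex above):
# 'res:1080' prefers resolutions up to a limit of 1080, '+fps' reverses
# the fps ordering (lower preferred), 'br~3000' prefers bitrates closest
# to 3000. A hypothetical call:
#
#     fs = InfoExtractor.FormatSort()
#     fs.evaluate_params({'format_sort': ['res:1080', '+fps']}, [])
#     # fs._order now holds the forced/priority defaults followed by
#     # 'res' and 'fps'; combined fields like 'br' expand into their
#     # component fields via add_item()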
1565 def print_verbose_info(self, to_screen):
1566 to_screen('[debug] Sort order given by user: %s' % ', '.join(self._sort_user))
1567 if self._sort_extractor:
1568 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1569 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1570 '+' if self._get_field_setting(field, 'reverse') else '', field,
1571 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1572 self._get_field_setting(field, 'limit_text'),
1573 self._get_field_setting(field, 'limit'))
1574 if self._get_field_setting(field, 'limit_text') is not None else '')
1575 for field in self._order if self._get_field_setting(field, 'visible')]))
1576
1577 def _calculate_field_preference_from_value(self, format, field, type, value):
1578 reverse = self._get_field_setting(field, 'reverse')
1579 closest = self._get_field_setting(field, 'closest')
1580 limit = self._get_field_setting(field, 'limit')
1581
1582 if type == 'extractor':
1583 maximum = self._get_field_setting(field, 'max')
1584 if value is None or (maximum is not None and value >= maximum):
1585 value = -1
1586 elif type == 'boolean':
1587 in_list = self._get_field_setting(field, 'in_list')
1588 not_in_list = self._get_field_setting(field, 'not_in_list')
1589 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1590 elif type == 'ordered':
1591 value = self._resolve_field_value(field, value, True)
1592
1593 # try to convert to number
1594 val_num = float_or_none(value)
1595 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1596 if is_num:
1597 value = val_num
1598
1599 return ((-10, 0) if value is None
1600 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1601 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1602 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1603 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1604 else (-1, value, 0))
1605
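# A worked example of the sort key above: for 'height' with limit 1080
# (reverse=False, closest=False), the values 720, 1080 and 2160 map to
#     (0, 720, 0), (0, 1080, 0), (0, -2160, 0)
# Keys are compared ascending (worst to best), so the over-limit 2160p
# format ranks below both in-limit formats and 1080p ranks best.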
1606 def _calculate_field_preference(self, format, field):
1607 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1608 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1609 if type == 'multiple':
1610 type = 'field' # Only 'field' is allowed in multiple for now
1611 actual_fields = self._get_field_setting(field, 'field')
1612
1613 def wrapped_function(values):
1614 values = tuple(filter(lambda x: x is not None, values))
1615 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1616 else values[0] if values
1617 else None)
1618
1619 value = wrapped_function((get_value(f) for f in actual_fields))
1620 else:
1621 value = get_value(field)
1622 return self._calculate_field_preference_from_value(format, field, type, value)
1623
1624 def calculate_preference(self, format):
1625 # Determine missing protocol
1626 if not format.get('protocol'):
1627 format['protocol'] = determine_protocol(format)
1628
1629 # Determine missing ext
1630 if not format.get('ext') and 'url' in format:
1631 format['ext'] = determine_ext(format['url'])
1632 if format.get('vcodec') == 'none':
1633 format['audio_ext'] = format['ext']
1634 format['video_ext'] = 'none'
1635 else:
1636 format['video_ext'] = format['ext']
1637 format['audio_ext'] = 'none'
1638 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1639 # format['preference'] = -1000
1640
1641 # Determine missing bitrates
1642 if format.get('tbr') is None:
1643 if format.get('vbr') is not None and format.get('abr') is not None:
1644 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1645 else:
1646 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1647 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1648 if format.get('acodec') != 'none' and format.get('abr') is None:
1649 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1650
1651 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1652
1653 def _sort_formats(self, formats, field_preference=[]):
1654 if not formats:
1655 raise ExtractorError('No video formats found')
1656 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1657 format_sort.evaluate_params(self._downloader.params, field_preference)
1658 if self._downloader.params.get('verbose', False):
1659 format_sort.print_verbose_info(self._downloader.to_screen)
1660 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1661
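# A minimal usage sketch (hypothetical extractor code; the example.com
# URLs are placeholders):
#
#     formats = [
#         {'format_id': 'sd', 'url': 'https://example.com/v360.mp4', 'height': 360},
#         {'format_id': 'hd', 'url': 'https://example.com/v1080.mp4', 'height': 1080},
#     ]
#     self._sort_formats(formats)  # sorts in place, worst to best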
1662 def _check_formats(self, formats, video_id):
1663 if formats:
1664 formats[:] = filter(
1665 lambda f: self._is_valid_url(
1666 f['url'], video_id,
1667 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1668 formats)
1669
1670 @staticmethod
1671 def _remove_duplicate_formats(formats):
1672 format_urls = set()
1673 unique_formats = []
1674 for f in formats:
1675 if f['url'] not in format_urls:
1676 format_urls.add(f['url'])
1677 unique_formats.append(f)
1678 formats[:] = unique_formats
1679
1680 def _is_valid_url(self, url, video_id, item='video', headers={}):
1681 url = self._proto_relative_url(url, scheme='http:')
1682 # For now assume non HTTP(S) URLs always valid
1683 if not (url.startswith('http://') or url.startswith('https://')):
1684 return True
1685 try:
1686 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1687 return True
1688 except ExtractorError as e:
1689 self.to_screen(
1690 '%s: %s URL is invalid, skipping: %s'
1691 % (video_id, item, error_to_compat_str(e.cause)))
1692 return False
1693
1694 def http_scheme(self):
1695 """ Either "http:" or "https:", depending on the user's preferences """
1696 return (
1697 'http:'
1698 if self._downloader.params.get('prefer_insecure', False)
1699 else 'https:')
1700
1701 def _proto_relative_url(self, url, scheme=None):
1702 if url is None:
1703 return url
1704 if url.startswith('//'):
1705 if scheme is None:
1706 scheme = self.http_scheme()
1707 return scheme + url
1708 else:
1709 return url
1710
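# Example: _proto_relative_url('//cdn.example.com/v.mp4') returns
# 'https://cdn.example.com/v.mp4' ('http://...' with --prefer-insecure);
# None and URLs that already carry a scheme pass through unchanged
# (cdn.example.com is a placeholder).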
1711 def _sleep(self, timeout, video_id, msg_template=None):
1712 if msg_template is None:
1713 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1714 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1715 self.to_screen(msg)
1716 time.sleep(timeout)
1717
1718 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1719 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1720 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1721 manifest = self._download_xml(
1722 manifest_url, video_id, 'Downloading f4m manifest',
1723 'Unable to download f4m manifest',
1724 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1725 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1726 transform_source=transform_source,
1727 fatal=fatal, data=data, headers=headers, query=query)
1728
1729 if manifest is False:
1730 return []
1731
1732 return self._parse_f4m_formats(
1733 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1734 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1735
1736 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1737 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1738 fatal=True, m3u8_id=None):
1739 if not isinstance(manifest, compat_etree_Element) and not fatal:
1740 return []
1741
1742 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1743 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1744 if akamai_pv is not None and ';' in akamai_pv.text:
1745 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1746 if playerVerificationChallenge.strip() != '':
1747 return []
1748
1749 formats = []
1750 manifest_version = '1.0'
1751 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1752 if not media_nodes:
1753 manifest_version = '2.0'
1754 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1755 # Remove unsupported DRM-protected media renditions from the
1756 # final formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1757 media_nodes = remove_encrypted_media(media_nodes)
1758 if not media_nodes:
1759 return formats
1760
1761 manifest_base_url = get_base_url(manifest)
1762
1763 bootstrap_info = xpath_element(
1764 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1765 'bootstrap info', default=None)
1766
1767 vcodec = None
1768 mime_type = xpath_text(
1769 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1770 'mime type', default=None)
1771 if mime_type and mime_type.startswith('audio/'):
1772 vcodec = 'none'
1773
1774 for i, media_el in enumerate(media_nodes):
1775 tbr = int_or_none(media_el.attrib.get('bitrate'))
1776 width = int_or_none(media_el.attrib.get('width'))
1777 height = int_or_none(media_el.attrib.get('height'))
1778 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1779 # If <bootstrapInfo> is present, the specified f4m is a
1780 # stream-level manifest, and only set-level manifests may refer to
1781 # external resources. See section 11.4 and section 4 of F4M spec
1782 if bootstrap_info is None:
1783 media_url = None
1784 # @href is introduced in 2.0, see section 11.6 of F4M spec
1785 if manifest_version == '2.0':
1786 media_url = media_el.attrib.get('href')
1787 if media_url is None:
1788 media_url = media_el.attrib.get('url')
1789 if not media_url:
1790 continue
1791 manifest_url = (
1792 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1793 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1794 # If media_url is itself an f4m manifest, do the recursive extraction,
1795 # since bitrates in the parent manifest (this one) and in the media_url
1796 # manifest may differ, making the f4m downloader unable to resolve
1797 # the format by the requested bitrate
1798 ext = determine_ext(manifest_url)
1799 if ext == 'f4m':
1800 f4m_formats = self._extract_f4m_formats(
1801 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1802 transform_source=transform_source, fatal=fatal)
1803 # Sometimes a stream-level manifest contains a single media entry
1804 # without any quality metadata (e.g. http://matchtv.ru/#live-player),
1805 # while the parent's media entry in the set-level manifest may
1806 # carry it. In such cases we copy it from the parent.
1807 if len(f4m_formats) == 1:
1808 f = f4m_formats[0]
1809 f.update({
1810 'tbr': f.get('tbr') or tbr,
1811 'width': f.get('width') or width,
1812 'height': f.get('height') or height,
1813 'format_id': f.get('format_id') if not tbr else format_id,
1814 'vcodec': vcodec,
1815 })
1816 formats.extend(f4m_formats)
1817 continue
1818 elif ext == 'm3u8':
1819 formats.extend(self._extract_m3u8_formats(
1820 manifest_url, video_id, 'mp4', preference=preference,
1821 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1822 continue
1823 formats.append({
1824 'format_id': format_id,
1825 'url': manifest_url,
1826 'manifest_url': manifest_url,
1827 'ext': 'flv' if bootstrap_info is not None else None,
1828 'protocol': 'f4m',
1829 'tbr': tbr,
1830 'width': width,
1831 'height': height,
1832 'vcodec': vcodec,
1833 'preference': preference,
1834 'quality': quality,
1835 })
1836 return formats
1837
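# For illustration, a synthetic set-level f4m manifest of the kind parsed
# above (no <bootstrapInfo>, so each <media> url pointing at another .f4m
# triggers the recursive extraction):
#
#     <manifest xmlns="http://ns.adobe.com/f4m/1.0">
#       <media bitrate="800" width="640" height="360" url="index_800.f4m"/>
#       <media bitrate="2000" width="1280" height="720" url="index_2000.f4m"/>
#     </manifest>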
1838 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1839 return {
1840 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1841 'url': m3u8_url,
1842 'ext': ext,
1843 'protocol': 'm3u8',
1844 'preference': preference - 100 if preference else -100,
1845 'quality': quality,
1846 'resolution': 'multiple',
1847 'format_note': 'Quality selection URL',
1848 }
1849
1850 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1851 entry_protocol='m3u8', preference=None, quality=None,
1852 m3u8_id=None, live=False, note=None, errnote=None,
1853 fatal=True, data=None, headers={}, query={}):
1854 res = self._download_webpage_handle(
1855 m3u8_url, video_id,
1856 note=note or 'Downloading m3u8 information',
1857 errnote=errnote or 'Failed to download m3u8 information',
1858 fatal=fatal, data=data, headers=headers, query=query)
1859
1860 if res is False:
1861 return []
1862
1863 m3u8_doc, urlh = res
1864 m3u8_url = urlh.geturl()
1865
1866 return self._parse_m3u8_formats(
1867 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1868 preference=preference, quality=quality, m3u8_id=m3u8_id,
1869 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1870 headers=headers, query=query, video_id=video_id)
1871
1872 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1873 entry_protocol='m3u8', preference=None, quality=None,
1874 m3u8_id=None, live=False, note=None, errnote=None,
1875 fatal=True, data=None, headers={}, query={}, video_id=None):
1876 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1877 return []
1878
1879 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1880 return []
1881
1882 formats = []
1883
1884 format_url = lambda u: (
1885 u
1886 if re.match(r'^https?://', u)
1887 else compat_urlparse.urljoin(m3u8_url, u))
1888
1889 split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1890
1891 # References:
1892 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1893 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1894 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1895
1896 # We should try extracting formats only from master playlists [1, 4.3.4],
1897 # i.e. playlists that describe the available qualities. On the other hand,
1898 # media playlists [1, 4.3.3] should be returned as is since they contain
1899 # just the media, without quality renditions.
1900 # Fortunately, a master playlist can easily be distinguished from a media
1901 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
1902 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1903 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
1904 # media playlist and MUST NOT appear in a master playlist, so we can
1905 # reliably detect a media playlist by this criterion.
1906
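# For illustration (synthetic snippets): a master playlist references
# media playlists via #EXT-X-STREAM-INF:
#
#     #EXTM3U
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#     low/index.m3u8
#
# while a media playlist lists the segments themselves and is recognized
# below by its mandatory #EXT-X-TARGETDURATION tag:
#
#     #EXTM3U
#     #EXT-X-TARGETDURATION:10
#     #EXTINF:9.009,
#     segment0.ts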
1907 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
1908 if not m3u8_doc:
1909 if not format_url:
1910 return []
1911 res = self._download_webpage_handle(
1912 format_url, video_id,
1913 note=False,
1914 errnote=errnote or 'Failed to download m3u8 playlist information',
1915 fatal=fatal, data=data, headers=headers, query=query)
1916
1917 if res is False:
1918 return []
1919
1920 m3u8_doc, urlh = res
1921 format_url = urlh.geturl()
1922
1923 playlist_formats = []
1924 i = (
1925 0
1926 if split_discontinuity
1927 else None)
1928 format_info = {
1929 'index': i,
1930 'key_data': None,
1931 'files': [],
1932 }
1933 for line in m3u8_doc.splitlines():
1934 if not line.startswith('#'):
1935 format_info['files'].append(line)
1936 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1937 i += 1
1938 playlist_formats.append(format_info)
1939 format_info = {
1940 'index': i,
1941 'url': format_url,
1942 'files': [],
1943 }
1944 playlist_formats.append(format_info)
1945 return playlist_formats
1946
1947 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1948
1949 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1950
1951 for format in playlist_formats:
1952 format_id = []
1953 if m3u8_id:
1954 format_id.append(m3u8_id)
1955 format_index = format.get('index')
1956 if format_index:
1957 format_id.append(str(format_index))
1958 f = {
1959 'format_id': '-'.join(format_id),
1960 'format_index': format_index,
1961 'url': m3u8_url,
1962 'ext': ext,
1963 'protocol': entry_protocol,
1964 'preference': preference,
1965 'quality': quality,
1966 }
1967 formats.append(f)
1968
1969 return formats
1970
1971 groups = {}
1972 last_stream_inf = {}
1973
1974 def extract_media(x_media_line):
1975 media = parse_m3u8_attributes(x_media_line)
1976 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1977 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1978 if not (media_type and group_id and name):
1979 return
1980 groups.setdefault(group_id, []).append(media)
1981 if media_type not in ('VIDEO', 'AUDIO'):
1982 return
1983 media_url = media.get('URI')
1984 if media_url:
1985 manifest_url = format_url(media_url)
1986 format_id = []
1987 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
1988
1989 for format in playlist_formats:
1990 format_index = format.get('index')
1991 for v in (m3u8_id, group_id, name):
1992 if v:
1993 format_id.append(v)
1994 if format_index:
1995 format_id.append(str(format_index))
1996 f = {
1997 'format_id': '-'.join(format_id),
1998 'format_index': format_index,
1999 'url': manifest_url,
2000 'manifest_url': m3u8_url,
2001 'language': media.get('LANGUAGE'),
2002 'ext': ext,
2003 'protocol': entry_protocol,
2004 'preference': preference,
2005 'quality': quality,
2006 }
2007 if media_type == 'AUDIO':
2008 f['vcodec'] = 'none'
2009 formats.append(f)
2010
2011 def build_stream_name():
2012 # Despite the specification not mentioning the NAME attribute for
2013 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2014 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2015 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2016 stream_name = last_stream_inf.get('NAME')
2017 if stream_name:
2018 return stream_name
2019 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2020 # from the corresponding rendition group
2021 stream_group_id = last_stream_inf.get('VIDEO')
2022 if not stream_group_id:
2023 return
2024 stream_group = groups.get(stream_group_id)
2025 if not stream_group:
2026 return stream_group_id
2027 rendition = stream_group[0]
2028 return rendition.get('NAME') or stream_group_id
2029
2030 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2031 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2032 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2033 for line in m3u8_doc.splitlines():
2034 if line.startswith('#EXT-X-MEDIA:'):
2035 extract_media(line)
2036
2037 for line in m3u8_doc.splitlines():
2038 if line.startswith('#EXT-X-STREAM-INF:'):
2039 last_stream_inf = parse_m3u8_attributes(line)
2040 elif line.startswith('#') or not line.strip():
2041 continue
2042 else:
2043 tbr = float_or_none(
2044 last_stream_inf.get('AVERAGE-BANDWIDTH')
2045 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2046 manifest_url = format_url(line.strip())
2047
2048 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
2049
2050 for format in playlist_formats:
2051 format_id = []
2052 if m3u8_id:
2053 format_id.append(m3u8_id)
2054 format_index = format.get('index')
2055 stream_name = build_stream_name()
2056 # The bandwidth of live streams may differ over time, making
2057 # format_id unpredictable, so it's better to keep the provided
2058 # format_id intact.
2059 if not live:
2060 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2061 if format_index:
2062 format_id.append(str(format_index))
2063 f = {
2064 'format_id': '-'.join(format_id),
2065 'format_index': format_index,
2066 'url': manifest_url,
2067 'manifest_url': m3u8_url,
2068 'tbr': tbr,
2069 'ext': ext,
2070 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2071 'protocol': entry_protocol,
2072 'preference': preference,
2073 'quality': quality,
2074 }
2075 resolution = last_stream_inf.get('RESOLUTION')
2076 if resolution:
2077 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2078 if mobj:
2079 f['width'] = int(mobj.group('width'))
2080 f['height'] = int(mobj.group('height'))
2081 # Unified Streaming Platform
2082 mobj = re.search(
2083 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2084 if mobj:
2085 abr, vbr = mobj.groups()
2086 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2087 f.update({
2088 'vbr': vbr,
2089 'abr': abr,
2090 })
2091 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2092 f.update(codecs)
2093 audio_group_id = last_stream_inf.get('AUDIO')
2094 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2095 # references a rendition group MUST have a CODECS attribute.
2096 # However, this is not always respected; for example, [2]
2097 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2098 # rendition group but does not have CODECS, and despite
2099 # referencing an audio group it represents a complete
2100 # (with audio and video) format. So, for such cases we will
2101 # ignore references to rendition groups and treat them
2102 # as complete formats.
2103 if audio_group_id and codecs and f.get('vcodec') != 'none':
2104 audio_group = groups.get(audio_group_id)
2105 if audio_group and audio_group[0].get('URI'):
2106 # TODO: update acodec for audio only formats with
2107 # the same GROUP-ID
2108 f['acodec'] = 'none'
2109 formats.append(f)
2110
2111 # for DailyMotion
2112 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2113 if progressive_uri:
2114 http_f = f.copy()
2115 del http_f['manifest_url']
2116 http_f.update({
2117 'format_id': f['format_id'].replace('hls-', 'http-'),
2118 'protocol': 'http',
2119 'url': progressive_uri,
2120 })
2121 formats.append(http_f)
2122
2123 last_stream_inf = {}
2124 return formats
2125
2126 @staticmethod
2127 def _xpath_ns(path, namespace=None):
2128 if not namespace:
2129 return path
2130 out = []
2131 for c in path.split('/'):
2132 if not c or c == '.':
2133 out.append(c)
2134 else:
2135 out.append('{%s}%s' % (namespace, c))
2136 return '/'.join(out)
2137
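# Example: _xpath_ns('./head/meta', 'http://www.w3.org/2005/SMIL21/Language')
# (a real SMIL namespace, shown for illustration) returns
# './{http://www.w3.org/2005/SMIL21/Language}head/{http://www.w3.org/2005/SMIL21/Language}meta',
# i.e. the Clark notation that ElementTree expects for namespaced lookups.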
2138 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2139 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2140
2141 if smil is False:
2142 assert not fatal
2143 return []
2144
2145 namespace = self._parse_smil_namespace(smil)
2146
2147 return self._parse_smil_formats(
2148 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2149
2150 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2151 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2152 if smil is False:
2153 return {}
2154 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2155
2156 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2157 return self._download_xml(
2158 smil_url, video_id, 'Downloading SMIL file',
2159 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2160
2161 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2162 namespace = self._parse_smil_namespace(smil)
2163
2164 formats = self._parse_smil_formats(
2165 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2166 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2167
2168 video_id = os.path.splitext(url_basename(smil_url))[0]
2169 title = None
2170 description = None
2171 upload_date = None
2172 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2173 name = meta.attrib.get('name')
2174 content = meta.attrib.get('content')
2175 if not name or not content:
2176 continue
2177 if not title and name == 'title':
2178 title = content
2179 elif not description and name in ('description', 'abstract'):
2180 description = content
2181 elif not upload_date and name == 'date':
2182 upload_date = unified_strdate(content)
2183
2184 thumbnails = [{
2185 'id': image.get('type'),
2186 'url': image.get('src'),
2187 'width': int_or_none(image.get('width')),
2188 'height': int_or_none(image.get('height')),
2189 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2190
2191 return {
2192 'id': video_id,
2193 'title': title or video_id,
2194 'description': description,
2195 'upload_date': upload_date,
2196 'thumbnails': thumbnails,
2197 'formats': formats,
2198 'subtitles': subtitles,
2199 }
2200
2201 def _parse_smil_namespace(self, smil):
2202 return self._search_regex(
2203 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2204
2205 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2206 base = smil_url
2207 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2208 b = meta.get('base') or meta.get('httpBase')
2209 if b:
2210 base = b
2211 break
2212
2213 formats = []
2214 rtmp_count = 0
2215 http_count = 0
2216 m3u8_count = 0
2217
2218 srcs = []
2219 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2220 for medium in media:
2221 src = medium.get('src')
2222 if not src or src in srcs:
2223 continue
2224 srcs.append(src)
2225
2226 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2227 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2228 width = int_or_none(medium.get('width'))
2229 height = int_or_none(medium.get('height'))
2230 proto = medium.get('proto')
2231 ext = medium.get('ext')
2232 src_ext = determine_ext(src)
2233 streamer = medium.get('streamer') or base
2234
2235 if proto == 'rtmp' or streamer.startswith('rtmp'):
2236 rtmp_count += 1
2237 formats.append({
2238 'url': streamer,
2239 'play_path': src,
2240 'ext': 'flv',
2241 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2242 'tbr': bitrate,
2243 'filesize': filesize,
2244 'width': width,
2245 'height': height,
2246 })
2247 if transform_rtmp_url:
2248 streamer, src = transform_rtmp_url(streamer, src)
2249 formats[-1].update({
2250 'url': streamer,
2251 'play_path': src,
2252 })
2253 continue
2254
2255 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2256 src_url = src_url.strip()
2257
2258 if proto == 'm3u8' or src_ext == 'm3u8':
2259 m3u8_formats = self._extract_m3u8_formats(
2260 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2261 if len(m3u8_formats) == 1:
2262 m3u8_count += 1
2263 m3u8_formats[0].update({
2264 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2265 'tbr': bitrate,
2266 'width': width,
2267 'height': height,
2268 })
2269 formats.extend(m3u8_formats)
2270 elif src_ext == 'f4m':
2271 f4m_url = src_url
2272 if not f4m_params:
2273 f4m_params = {
2274 'hdcore': '3.2.0',
2275 'plugin': 'flowplayer-3.2.0.1',
2276 }
2277 f4m_url += '&' if '?' in f4m_url else '?'
2278 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2279 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2280 elif src_ext == 'mpd':
2281 formats.extend(self._extract_mpd_formats(
2282 src_url, video_id, mpd_id='dash', fatal=False))
2283 elif re.search(r'\.ism/[Mm]anifest', src_url):
2284 formats.extend(self._extract_ism_formats(
2285 src_url, video_id, ism_id='mss', fatal=False))
2286 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2287 http_count += 1
2288 formats.append({
2289 'url': src_url,
2290 'ext': ext or src_ext or 'flv',
2291 'format_id': 'http-%d' % (bitrate or http_count),
2292 'tbr': bitrate,
2293 'filesize': filesize,
2294 'width': width,
2295 'height': height,
2296 })
2297
2298 return formats
2299
2300 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2301 urls = []
2302 subtitles = {}
2303 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2304 src = textstream.get('src')
2305 if not src or src in urls:
2306 continue
2307 urls.append(src)
2308 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2309 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2310 subtitles.setdefault(lang, []).append({
2311 'url': src,
2312 'ext': ext,
2313 })
2314 return subtitles
2315
2316 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2317 xspf = self._download_xml(
2318 xspf_url, playlist_id, 'Downloading xspf playlist',
2319 'Unable to download xspf manifest', fatal=fatal)
2320 if xspf is False:
2321 return []
2322 return self._parse_xspf(
2323 xspf, playlist_id, xspf_url=xspf_url,
2324 xspf_base_url=base_url(xspf_url))
2325
2326 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2327 NS_MAP = {
2328 'xspf': 'http://xspf.org/ns/0/',
2329 's1': 'http://static.streamone.nl/player/ns/0',
2330 }
2331
2332 entries = []
2333 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2334 title = xpath_text(
2335 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2336 description = xpath_text(
2337 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2338 thumbnail = xpath_text(
2339 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2340 duration = float_or_none(
2341 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2342
2343 formats = []
2344 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2345 format_url = urljoin(xspf_base_url, location.text)
2346 if not format_url:
2347 continue
2348 formats.append({
2349 'url': format_url,
2350 'manifest_url': xspf_url,
2351 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2352 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2353 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2354 })
2355 self._sort_formats(formats)
2356
2357 entries.append({
2358 'id': playlist_id,
2359 'title': title,
2360 'description': description,
2361 'thumbnail': thumbnail,
2362 'duration': duration,
2363 'formats': formats,
2364 })
2365 return entries
2366
2367 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2368 res = self._download_xml_handle(
2369 mpd_url, video_id,
2370 note=note or 'Downloading MPD manifest',
2371 errnote=errnote or 'Failed to download MPD manifest',
2372 fatal=fatal, data=data, headers=headers, query=query)
2373 if res is False:
2374 return []
2375 mpd_doc, urlh = res
2376 if mpd_doc is None:
2377 return []
2378 mpd_base_url = base_url(urlh.geturl())
2379
2380 return self._parse_mpd_formats(
2381 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2382
2383 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2384 """
2385 Parse formats from MPD manifest.
2386 References:
2387 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2388 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2389 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2390 """
2391 if not self._downloader.params.get('dynamic_mpd'):
2392 if mpd_doc.get('type') == 'dynamic':
2393 return []
2394
2395 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2396
2397 def _add_ns(path):
2398 return self._xpath_ns(path, namespace)
2399
2400 def is_drm_protected(element):
2401 return element.find(_add_ns('ContentProtection')) is not None
2402
2403 def extract_multisegment_info(element, ms_parent_info):
2404 ms_info = ms_parent_info.copy()
2405
2406 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2407 # common attributes and elements. We will only extract those
2408 # relevant to us.
2409 def extract_common(source):
2410 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2411 if segment_timeline is not None:
2412 s_e = segment_timeline.findall(_add_ns('S'))
2413 if s_e:
2414 ms_info['total_number'] = 0
2415 ms_info['s'] = []
2416 for s in s_e:
2417 r = int(s.get('r', 0))
2418 ms_info['total_number'] += 1 + r
2419 ms_info['s'].append({
2420 't': int(s.get('t', 0)),
2421 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2422 'd': int(s.attrib['d']),
2423 'r': r,
2424 })
2425 start_number = source.get('startNumber')
2426 if start_number:
2427 ms_info['start_number'] = int(start_number)
2428 timescale = source.get('timescale')
2429 if timescale:
2430 ms_info['timescale'] = int(timescale)
2431 segment_duration = source.get('duration')
2432 if segment_duration:
2433 ms_info['segment_duration'] = float(segment_duration)
2434
2435 def extract_Initialization(source):
2436 initialization = source.find(_add_ns('Initialization'))
2437 if initialization is not None:
2438 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2439
2440 segment_list = element.find(_add_ns('SegmentList'))
2441 if segment_list is not None:
2442 extract_common(segment_list)
2443 extract_Initialization(segment_list)
2444 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2445 if segment_urls_e:
2446 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2447 else:
2448 segment_template = element.find(_add_ns('SegmentTemplate'))
2449 if segment_template is not None:
2450 extract_common(segment_template)
2451 media = segment_template.get('media')
2452 if media:
2453 ms_info['media'] = media
2454 initialization = segment_template.get('initialization')
2455 if initialization:
2456 ms_info['initialization'] = initialization
2457 else:
2458 extract_Initialization(segment_template)
2459 return ms_info
2460
2461 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2462
2463 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2464 formats = []
2465 for period in mpd_doc.findall(_add_ns('Period')):
2466 period_duration = parse_duration(period.get('duration')) or mpd_duration
2467 period_ms_info = extract_multisegment_info(period, {
2468 'start_number': 1,
2469 'timescale': 1,
2470 })
2471 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2472 if skip_unplayable and is_drm_protected(adaptation_set):
2473 continue
2474 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2475 for representation in adaptation_set.findall(_add_ns('Representation')):
2476 if skip_unplayable and is_drm_protected(representation):
2477 continue
2478 representation_attrib = adaptation_set.attrib.copy()
2479 representation_attrib.update(representation.attrib)
2480 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2481 mime_type = representation_attrib['mimeType']
2482 content_type = mime_type.split('/')[0]
2483 if content_type == 'text':
2484 # TODO implement WebVTT downloading
2485 pass
2486 elif content_type in ('video', 'audio'):
2487 base_url = ''
2488 for element in (representation, adaptation_set, period, mpd_doc):
2489 base_url_e = element.find(_add_ns('BaseURL'))
2490 if base_url_e is not None:
2491 base_url = base_url_e.text + base_url
2492 if re.match(r'^https?://', base_url):
2493 break
2494 if mpd_base_url and not re.match(r'^https?://', base_url):
2495 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2496 mpd_base_url += '/'
2497 base_url = mpd_base_url + base_url
2498 representation_id = representation_attrib.get('id')
2499 lang = representation_attrib.get('lang')
2500 url_el = representation.find(_add_ns('BaseURL'))
2501 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2502 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2503 f = {
2504 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2505 'manifest_url': mpd_url,
2506 'ext': mimetype2ext(mime_type),
2507 'width': int_or_none(representation_attrib.get('width')),
2508 'height': int_or_none(representation_attrib.get('height')),
2509 'tbr': float_or_none(bandwidth, 1000),
2510 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2511 'fps': int_or_none(representation_attrib.get('frameRate')),
2512 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2513 'format_note': 'DASH %s' % content_type,
2514 'filesize': filesize,
2515 'container': mimetype2ext(mime_type) + '_dash',
2516 }
2517 f.update(parse_codecs(representation_attrib.get('codecs')))
2518 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2519
2520 def prepare_template(template_name, identifiers):
2521 tmpl = representation_ms_info[template_name]
2522 # First of all, % characters outside $...$ templates
2523 # must be escaped by doubling for proper processing
2524 # by the % string-formatting operator used further below (see
2525 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2526 t = ''
2527 in_template = False
2528 for c in tmpl:
2529 t += c
2530 if c == '$':
2531 in_template = not in_template
2532 elif c == '%' and not in_template:
2533 t += c
2534 # Next, $...$ templates are translated to their
2535 # %(...) counterparts to be used with % operator
2536 t = t.replace('$RepresentationID$', representation_id)
2537 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2538 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2539 t = t.replace('$$', '$')  # unescape any literal '$$' back to '$'
2540 return t
2541
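# For illustration, with identifiers ('Number', 'Bandwidth', 'Time') and a
# made-up representation_id 'video1', prepare_template turns the DASH template
#     'seg_$RepresentationID$_$Number%05d$.m4s?p=100%25'
# into
#     'seg_video1_%(Number)05d.m4s?p=100%%25'
# ready to be filled with the % operator below.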
2542 # @initialization is a regular template like the @media one,
2543 # so it should be handled in just the same way (see
2544 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2545 if 'initialization' in representation_ms_info:
2546 initialization_template = prepare_template(
2547 'initialization',
2548 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2549 # $Time$ shall not be included for @initialization thus
2550 # only $Bandwidth$ remains
2551 ('Bandwidth', ))
2552 representation_ms_info['initialization_url'] = initialization_template % {
2553 'Bandwidth': bandwidth,
2554 }
2555
2556 def location_key(location):
2557 return 'url' if re.match(r'^https?://', location) else 'path'
2558
2559 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2560
2561 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2562 media_location_key = location_key(media_template)
2563
2564 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2565 # can't be used at the same time
2566 if '%(Number' in media_template and 's' not in representation_ms_info:
2567 segment_duration = None
2568 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2569 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2570 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2571 representation_ms_info['fragments'] = [{
2572 media_location_key: media_template % {
2573 'Number': segment_number,
2574 'Bandwidth': bandwidth,
2575 },
2576 'duration': segment_duration,
2577 } for segment_number in range(
2578 representation_ms_info['start_number'],
2579 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2580 else:
2581 # $Number*$ or $Time$ in media template with S list available
2582 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2583 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2584 representation_ms_info['fragments'] = []
2585 segment_time = 0
2586 segment_d = None
2587 segment_number = representation_ms_info['start_number']
2588
2589 def add_segment_url():
2590 segment_url = media_template % {
2591 'Time': segment_time,
2592 'Bandwidth': bandwidth,
2593 'Number': segment_number,
2594 }
2595 representation_ms_info['fragments'].append({
2596 media_location_key: segment_url,
2597 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2598 })
2599
2600 for num, s in enumerate(representation_ms_info['s']):
2601 segment_time = s.get('t') or segment_time
2602 segment_d = s['d']
2603 add_segment_url()
2604 segment_number += 1
2605 for r in range(s.get('r', 0)):
2606 segment_time += segment_d
2607 add_segment_url()
2608 segment_number += 1
2609 segment_time += segment_d
2610 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2611 # No media template
2612 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2613 # or any YouTube dashsegments video
2614 fragments = []
2615 segment_index = 0
2616 timescale = representation_ms_info['timescale']
2617 for s in representation_ms_info['s']:
2618 duration = float_or_none(s['d'], timescale)
2619 for r in range(s.get('r', 0) + 1):
2620 segment_uri = representation_ms_info['segment_urls'][segment_index]
2621 fragments.append({
2622 location_key(segment_uri): segment_uri,
2623 'duration': duration,
2624 })
2625 segment_index += 1
2626 representation_ms_info['fragments'] = fragments
2627 elif 'segment_urls' in representation_ms_info:
2628 # Segment URLs with no SegmentTimeline
2629 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2630 # https://github.com/ytdl-org/youtube-dl/pull/14844
2631 fragments = []
2632 segment_duration = float_or_none(
2633 representation_ms_info['segment_duration'],
2634 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2635 for segment_url in representation_ms_info['segment_urls']:
2636 fragment = {
2637 location_key(segment_url): segment_url,
2638 }
2639 if segment_duration:
2640 fragment['duration'] = segment_duration
2641 fragments.append(fragment)
2642 representation_ms_info['fragments'] = fragments
2643 # If there is a fragments key available then we correctly recognized fragmented media.
2644 # Otherwise we will assume unfragmented media with direct access. Technically, such an
2645 # assumption is not necessarily correct since we may simply have no support for
2646 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2647 if 'fragments' in representation_ms_info:
2648 f.update({
2649 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2650 'url': mpd_url or base_url,
2651 'fragment_base_url': base_url,
2652 'fragments': [],
2653 'protocol': 'http_dash_segments',
2654 })
2655 if 'initialization_url' in representation_ms_info:
2656 initialization_url = representation_ms_info['initialization_url']
2657 if not f.get('url'):
2658 f['url'] = initialization_url
2659 f['fragments'].append({location_key(initialization_url): initialization_url})
2660 f['fragments'].extend(representation_ms_info['fragments'])
2661 else:
2662 # Assuming direct URL to unfragmented media.
2663 f['url'] = base_url
2664 formats.append(f)
2665 else:
2666 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2667 return formats
2668
2669 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2670 res = self._download_xml_handle(
2671 ism_url, video_id,
2672 note=note or 'Downloading ISM manifest',
2673 errnote=errnote or 'Failed to download ISM manifest',
2674 fatal=fatal, data=data, headers=headers, query=query)
2675 if res is False:
2676 return []
2677 ism_doc, urlh = res
2678 if ism_doc is None:
2679 return []
2680
2681 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2682
2683 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2684 """
2685 Parse formats from ISM manifest.
2686 References:
2687 1. [MS-SSTR]: Smooth Streaming Protocol,
2688 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2689 """
2690 if ism_doc.get('IsLive') == 'TRUE':
2691 return []
2692 if (not self._downloader.params.get('allow_unplayable_formats')
2693 and ism_doc.find('Protection') is not None):
2694 return []
2695
2696 duration = int(ism_doc.attrib['Duration'])
2697 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2698
2699 formats = []
2700 for stream in ism_doc.findall('StreamIndex'):
2701 stream_type = stream.get('Type')
2702 if stream_type not in ('video', 'audio'):
2703 continue
2704 url_pattern = stream.attrib['Url']
2705 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2706 stream_name = stream.get('Name')
2707 for track in stream.findall('QualityLevel'):
2708 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2709 # TODO: add support for WVC1 and WMAP
2710 if fourcc not in ('H264', 'AVC1', 'AACL'):
2711 self.report_warning('%s is not a supported codec' % fourcc)
2712 continue
2713 tbr = int(track.attrib['Bitrate']) // 1000
2714 # [1] does not mention Width and Height attributes. However,
2715 # they're often present while MaxWidth and MaxHeight are
2716 # missing, so they should be used as fallbacks
2717 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2718 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2719 sampling_rate = int_or_none(track.get('SamplingRate'))
2720
2721 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2722 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2723
2724 fragments = []
2725 fragment_ctx = {
2726 'time': 0,
2727 }
2728 stream_fragments = stream.findall('c')
2729 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2730 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2731 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2732 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2733 if not fragment_ctx['duration']:
2734 try:
2735 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # peek at the next <c> element
2736 except IndexError:
2737 next_fragment_time = duration
2738 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2739 for _ in range(fragment_repeat):
2740 fragments.append({
2741 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2742 'duration': fragment_ctx['duration'] / stream_timescale,
2743 })
2744 fragment_ctx['time'] += fragment_ctx['duration']
2745
2746 format_id = []
2747 if ism_id:
2748 format_id.append(ism_id)
2749 if stream_name:
2750 format_id.append(stream_name)
2751 format_id.append(compat_str(tbr))
2752
2753 formats.append({
2754 'format_id': '-'.join(format_id),
2755 'url': ism_url,
2756 'manifest_url': ism_url,
2757 'ext': 'ismv' if stream_type == 'video' else 'isma',
2758 'width': width,
2759 'height': height,
2760 'tbr': tbr,
2761 'asr': sampling_rate,
2762 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2763 'acodec': 'none' if stream_type == 'video' else fourcc,
2764 'protocol': 'ism',
2765 'fragments': fragments,
2766 '_download_params': {
2767 'duration': duration,
2768 'timescale': stream_timescale,
2769 'width': width or 0,
2770 'height': height or 0,
2771 'fourcc': fourcc,
2772 'codec_private_data': track.get('CodecPrivateData'),
2773 'sampling_rate': sampling_rate,
2774 'channels': int_or_none(track.get('Channels', 2)),
2775 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2776 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2777 },
2778 })
2779 return formats
2780
2781 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2782 def absolute_url(item_url):
2783 return urljoin(base_url, item_url)
2784
2785 def parse_content_type(content_type):
2786 if not content_type:
2787 return {}
2788 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2789 if ctr:
2790 mimetype, codecs = ctr.groups()
2791 f = parse_codecs(codecs)
2792 f['ext'] = mimetype2ext(mimetype)
2793 return f
2794 return {}
2795
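# For illustration: parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# yields roughly {'ext': 'mp4', 'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}
# (the exact codec fields come from parse_codecs in ..utils).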
2796 def _media_formats(src, cur_media_type, type_info={}):
2797 full_url = absolute_url(src)
2798 ext = type_info.get('ext') or determine_ext(full_url)
2799 if ext == 'm3u8':
2800 is_plain_url = False
2801 formats = self._extract_m3u8_formats(
2802 full_url, video_id, ext='mp4',
2803 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2804 preference=preference, quality=quality, fatal=False)
2805 elif ext == 'mpd':
2806 is_plain_url = False
2807 formats = self._extract_mpd_formats(
2808 full_url, video_id, mpd_id=mpd_id, fatal=False)
2809 else:
2810 is_plain_url = True
2811 formats = [{
2812 'url': full_url,
2813 'vcodec': 'none' if cur_media_type == 'audio' else None,
2814 }]
2815 return is_plain_url, formats
2816
2817 entries = []
2818 # amp-video and amp-audio are very similar to their HTML5 counterparts
2819 # so we will include them right here (see
2820 # https://www.ampproject.org/docs/reference/components/amp-video)
2821 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2822 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2823 media_tags = [(media_tag, media_tag_name, media_type, '')
2824 for media_tag, media_tag_name, media_type
2825 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2826 media_tags.extend(re.findall(
2827 # We only allow video|audio followed by a whitespace or '>'.
2828 # Allowing more characters may end up in significant slow down (see
2829 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2830 # http://www.porntrex.com/maps/videositemap.xml).
2831 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2832 for media_tag, _, media_type, media_content in media_tags:
2833 media_info = {
2834 'formats': [],
2835 'subtitles': {},
2836 }
2837 media_attributes = extract_attributes(media_tag)
2838 src = strip_or_none(media_attributes.get('src'))
2839 if src:
2840 _, formats = _media_formats(src, media_type)
2841 media_info['formats'].extend(formats)
2842 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2843 if media_content:
2844 for source_tag in re.findall(r'<source[^>]+>', media_content):
2845 s_attr = extract_attributes(source_tag)
2846 # data-video-src and data-src are non-standard but seen
2847 # several times in the wild
2848 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2849 if not src:
2850 continue
2851 f = parse_content_type(s_attr.get('type'))
2852 is_plain_url, formats = _media_formats(src, media_type, f)
2853 if is_plain_url:
2854 # width, height, res, label and title attributes are
2855 # all non-standard but seen several times in the wild
2856 labels = [
2857 s_attr.get(lbl)
2858 for lbl in ('label', 'title')
2859 if str_or_none(s_attr.get(lbl))
2860 ]
2861 width = int_or_none(s_attr.get('width'))
2862 height = (int_or_none(s_attr.get('height'))
2863 or int_or_none(s_attr.get('res')))
2864 if not width or not height:
2865 for lbl in labels:
2866 resolution = parse_resolution(lbl)
2867 if not resolution:
2868 continue
2869 width = width or resolution.get('width')
2870 height = height or resolution.get('height')
2871 for lbl in labels:
2872 tbr = parse_bitrate(lbl)
2873 if tbr:
2874 break
2875 else:
2876 tbr = None
2877 f.update({
2878 'width': width,
2879 'height': height,
2880 'tbr': tbr,
2881 'format_id': s_attr.get('label') or s_attr.get('title'),
2882 })
2883 f.update(formats[0])
2884 media_info['formats'].append(f)
2885 else:
2886 media_info['formats'].extend(formats)
2887 for track_tag in re.findall(r'<track[^>]+>', media_content):
2888 track_attributes = extract_attributes(track_tag)
2889 kind = track_attributes.get('kind')
2890 if not kind or kind in ('subtitles', 'captions'):
2891 src = strip_or_none(track_attributes.get('src'))
2892 if not src:
2893 continue
2894 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2895 media_info['subtitles'].setdefault(lang, []).append({
2896 'url': absolute_url(src),
2897 })
2898 for f in media_info['formats']:
2899 f.setdefault('http_headers', {})['Referer'] = base_url
2900 if media_info['formats'] or media_info['subtitles']:
2901 entries.append(media_info)
2902 return entries
2903
2904 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
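        # A hedged usage sketch (the host names below are illustrative, not
        # part of this module):
        #   formats = self._extract_akamai_formats(
        #       'http://example-i.akamaihd.net/i/foo/master.m3u8', video_id,
        #       hosts={'hds': 'example-f.akamaihd.net',
        #              'hls': 'example-i.akamaihd.net',
        #              'http': 'example-vh.akamaihd.net'})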
2905 signed = 'hdnea=' in manifest_url
2906 if not signed:
2907 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2908 manifest_url = re.sub(
2909 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2910 '', manifest_url).strip('?')
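            # e.g. '.../master.m3u8?b=100-1000&__a__=off' is reduced to
            # '.../master.m3u8'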
2911
2912 formats = []
2913
2914 hdcore_sign = 'hdcore=3.7.0'
2915 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2916 hds_host = hosts.get('hds')
2917 if hds_host:
2918 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2919 if 'hdcore=' not in f4m_url:
2920 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2921 f4m_formats = self._extract_f4m_formats(
2922 f4m_url, video_id, f4m_id='hds', fatal=False)
2923 for entry in f4m_formats:
2924 entry.update({'extra_param_to_segment_url': hdcore_sign})
2925 formats.extend(f4m_formats)
2926
2927 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2928 hls_host = hosts.get('hls')
2929 if hls_host:
2930 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2931 m3u8_formats = self._extract_m3u8_formats(
2932 m3u8_url, video_id, 'mp4', 'm3u8_native',
2933 m3u8_id='hls', fatal=False)
2934 formats.extend(m3u8_formats)
2935
2936 http_host = hosts.get('http')
2937 if http_host and m3u8_formats and not signed:
2938 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
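            # e.g. for http://host/i/foo_,300,600,900,.mp4.csmil/master.m3u8
            # qualities == ['300', '600', '900'] and each HTTP variant becomes
            # <protocol>://<http_host>/foo_<quality>.mp4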
2939 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2940 qualities_length = len(qualities)
2941 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2942 i = 0
2943 for f in m3u8_formats:
2944 if f['vcodec'] != 'none':
2945 for protocol in ('http', 'https'):
2946 http_f = f.copy()
2947 del http_f['manifest_url']
2948 http_url = re.sub(
2949 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2950 http_f.update({
2951 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2952 'url': http_url,
2953 'protocol': protocol,
2954 })
2955 formats.append(http_f)
2956 i += 1
2957
2958 return formats
2959
2960 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
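        # A hedged usage sketch (the URL is illustrative):
        #   formats = self._extract_wowza_formats(
        #       'http://example.com:1935/vod/sample/playlist.m3u8',
        #       video_id, skip_protocols=['f4m'])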
2961 query = compat_urlparse.urlparse(url).query
2962 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2963 mobj = re.search(
2964 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2965 url_base = mobj.group('url')
2966 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
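        # e.g. for rtmp://example.com/vod/sample/playlist.m3u8 the manifest
        # suffix is stripped, url_base == '//example.com/vod/sample' and
        # http_base_url == 'http://example.com/vod/sample'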
2967 formats = []
2968
2969 def manifest_url(manifest):
2970 m_url = '%s/%s' % (http_base_url, manifest)
2971 if query:
2972 m_url += '?%s' % query
2973 return m_url
2974
2975 if 'm3u8' not in skip_protocols:
2976 formats.extend(self._extract_m3u8_formats(
2977 manifest_url('playlist.m3u8'), video_id, 'mp4',
2978 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2979 if 'f4m' not in skip_protocols:
2980 formats.extend(self._extract_f4m_formats(
2981 manifest_url('manifest.f4m'),
2982 video_id, f4m_id='hds', fatal=False))
2983 if 'dash' not in skip_protocols:
2984 formats.extend(self._extract_mpd_formats(
2985 manifest_url('manifest.mpd'),
2986 video_id, mpd_id='dash', fatal=False))
2987 if re.search(r'(?:/smil:|\.smil)', url_base):
2988 if 'smil' not in skip_protocols:
2989 rtmp_formats = self._extract_smil_formats(
2990 manifest_url('jwplayer.smil'),
2991 video_id, fatal=False)
2992 for rtmp_format in rtmp_formats:
2993 rtsp_format = rtmp_format.copy()
2994 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2995 del rtsp_format['play_path']
2996 del rtsp_format['ext']
2997 rtsp_format.update({
2998 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2999 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3000 'protocol': 'rtsp',
3001 })
3002 formats.extend([rtmp_format, rtsp_format])
3003 else:
3004 for protocol in ('rtmp', 'rtsp'):
3005 if protocol not in skip_protocols:
3006 formats.append({
3007 'url': '%s:%s' % (protocol, url_base),
3008 'format_id': protocol,
3009 'protocol': protocol,
3010 })
3011 return formats
3012
3013 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3014 mobj = re.search(
3015 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3016 webpage)
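        # matches inline setups such as (illustrative):
        #   jwplayer("player").setup({"file": "http://example.com/video.mp4"})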
3017 if mobj:
3018 try:
3019 jwplayer_data = self._parse_json(mobj.group('options'),
3020 video_id=video_id,
3021 transform_source=transform_source)
3022 except ExtractorError:
3023 pass
3024 else:
3025 if isinstance(jwplayer_data, dict):
3026 return jwplayer_data
3027
3028 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3029 jwplayer_data = self._find_jwplayer_data(
3030 webpage, video_id, transform_source=js_to_json)
3031 return self._parse_jwplayer_data(
3032 jwplayer_data, video_id, *args, **kwargs)
3033
3034 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3035 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
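        # A minimal sketch of the accepted structure, per the parsing below
        # (values are illustrative):
        #   {'playlist': [{'mediaid': 'x123', 'title': 'Title',
        #                  'sources': [{'file': 'http://example.com/v.mp4'}],
        #                  'tracks': [{'kind': 'captions', 'label': 'en',
        #                              'file': 'http://example.com/en.vtt'}]}]}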
3036 # JWPlayer backward compatibility: flattened playlists
3037 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3038 if 'playlist' not in jwplayer_data:
3039 jwplayer_data = {'playlist': [jwplayer_data]}
3040
3041 entries = []
3042
3043 # JWPlayer backward compatibility: single playlist item
3044 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3045 if not isinstance(jwplayer_data['playlist'], list):
3046 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3047
3048 for video_data in jwplayer_data['playlist']:
3049 # JWPlayer backward compatibility: flattened sources
3050 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3051 if 'sources' not in video_data:
3052 video_data['sources'] = [video_data]
3053
3054 this_video_id = video_id or video_data['mediaid']
3055
3056 formats = self._parse_jwplayer_formats(
3057 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3058 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3059
3060 subtitles = {}
3061 tracks = video_data.get('tracks')
3062 if tracks and isinstance(tracks, list):
3063 for track in tracks:
3064 if not isinstance(track, dict):
3065 continue
3066 track_kind = track.get('kind')
3067 if not track_kind or not isinstance(track_kind, compat_str):
3068 continue
3069 if track_kind.lower() not in ('captions', 'subtitles'):
3070 continue
3071 track_url = urljoin(base_url, track.get('file'))
3072 if not track_url:
3073 continue
3074 subtitles.setdefault(track.get('label') or 'en', []).append({
3075 'url': self._proto_relative_url(track_url)
3076 })
3077
3078 entry = {
3079 'id': this_video_id,
3080 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3081 'description': clean_html(video_data.get('description')),
3082 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3083 'timestamp': int_or_none(video_data.get('pubdate')),
3084 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3085 'subtitles': subtitles,
3086 }
3087 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3088 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3089 entry.update({
3090 '_type': 'url_transparent',
3091 'url': formats[0]['url'],
3092 })
3093 else:
3094 self._sort_formats(formats)
3095 entry['formats'] = formats
3096 entries.append(entry)
3097 if len(entries) == 1:
3098 return entries[0]
3099 else:
3100 return self.playlist_result(entries)
3101
3102 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3103 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3104 urls = []
3105 formats = []
3106 for source in jwplayer_sources_data:
3107 if not isinstance(source, dict):
3108 continue
3109 source_url = urljoin(
3110 base_url, self._proto_relative_url(source.get('file')))
3111 if not source_url or source_url in urls:
3112 continue
3113 urls.append(source_url)
3114 source_type = source.get('type') or ''
3115 ext = mimetype2ext(source_type) or determine_ext(source_url)
3116 if source_type == 'hls' or ext == 'm3u8':
3117 formats.extend(self._extract_m3u8_formats(
3118 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3119 m3u8_id=m3u8_id, fatal=False))
3120 elif source_type == 'dash' or ext == 'mpd':
3121 formats.extend(self._extract_mpd_formats(
3122 source_url, video_id, mpd_id=mpd_id, fatal=False))
3123 elif ext == 'smil':
3124 formats.extend(self._extract_smil_formats(
3125 source_url, video_id, fatal=False))
3126 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3127 elif source_type.startswith('audio') or ext in (
3128 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3129 formats.append({
3130 'url': source_url,
3131 'vcodec': 'none',
3132 'ext': ext,
3133 })
3134 else:
3135 height = int_or_none(source.get('height'))
3136 if height is None:
3137                     # Often no height is provided, but there is a label in
3138                     # a format like "1080p", "720p SD", or 1080.
3139 height = int_or_none(self._search_regex(
3140 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3141 'height', default=None))
3142 a_format = {
3143 'url': source_url,
3144 'width': int_or_none(source.get('width')),
3145 'height': height,
3146 'tbr': int_or_none(source.get('bitrate')),
3147 'ext': ext,
3148 }
3149 if source_url.startswith('rtmp'):
3150 a_format['ext'] = 'flv'
3151 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3152 # of jwplayer.flash.swf
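                    # e.g. rtmp://example.com/play/mp4:folder/file.mp4 yields
                    # url 'rtmp://example.com/play/' and
                    # play_path 'mp4:folder/file.mp4'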
3153 rtmp_url_parts = re.split(
3154 r'((?:mp4|mp3|flv):)', source_url, 1)
3155 if len(rtmp_url_parts) == 3:
3156 rtmp_url, prefix, play_path = rtmp_url_parts
3157 a_format.update({
3158 'url': rtmp_url,
3159 'play_path': prefix + play_path,
3160 })
3161 if rtmp_params:
3162 a_format.update(rtmp_params)
3163 formats.append(a_format)
3164 return formats
3165
3166 def _live_title(self, name):
3167 """ Generate the title for a live video """
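        # e.g. _live_title('My Stream') -> 'My Stream 2021-03-01 12:34'
        # (the timestamp is illustrative)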
3168 now = datetime.datetime.now()
3169 now_str = now.strftime('%Y-%m-%d %H:%M')
3170 return name + ' ' + now_str
3171
3172 def _int(self, v, name, fatal=False, **kwargs):
3173 res = int_or_none(v, **kwargs)
3176 if res is None:
3177 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3178 if fatal:
3179 raise ExtractorError(msg)
3180 else:
3181 self._downloader.report_warning(msg)
3182 return res
3183
3184 def _float(self, v, name, fatal=False, **kwargs):
3185 res = float_or_none(v, **kwargs)
3186 if res is None:
3187 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3188 if fatal:
3189 raise ExtractorError(msg)
3190 else:
3191 self._downloader.report_warning(msg)
3192 return res
3193
3194 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3195 path='/', secure=False, discard=False, rest={}, **kwargs):
3196 cookie = compat_cookiejar_Cookie(
3197 0, name, value, port, port is not None, domain, True,
3198 domain.startswith('.'), path, True, secure, expire_time,
3199 discard, None, None, rest)
3200 self._downloader.cookiejar.set_cookie(cookie)
3201
3202 def _get_cookies(self, url):
3203 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
3204 req = sanitized_Request(url)
3205 self._downloader.cookiejar.add_cookie_header(req)
3206 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
3207
3208 def _apply_first_set_cookie_header(self, url_handle, cookie):
3209 """
3210 Apply first Set-Cookie header instead of the last. Experimental.
3211
3212         Some sites (e.g. [1-3]) may serve two cookies under the same name
3213         in the Set-Cookie header and expect the first (old) one to be set
3214         rather than the second (new) one. However, per RFC 6265 the newer
3215         cookie should be stored in the cookie jar, which is what actually
3216         happens. We work around this issue by manually resetting the cookie
3217         to the first one.
3218 1. https://new.vk.com/
3219 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3220 3. https://learning.oreilly.com/
3221 """
3222 for header, cookies in url_handle.headers.items():
3223 if header.lower() != 'set-cookie':
3224 continue
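            # Python 3 decodes raw header bytes as latin-1, so re-encode and
            # decode as UTF-8 to recover any non-ASCII cookie values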
3225 if sys.version_info[0] >= 3:
3226 cookies = cookies.encode('iso-8859-1')
3227 cookies = cookies.decode('utf-8')
3228 cookie_value = re.search(
3229 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3230 if cookie_value:
3231 value, domain = cookie_value.groups()
3232 self._set_cookie(domain, cookie, value)
3233 break
3234
3235 def get_testcases(self, include_onlymatching=False):
3236 t = getattr(self, '_TEST', None)
3237 if t:
3238 assert not hasattr(self, '_TESTS'), \
3239 '%s has _TEST and _TESTS' % type(self).__name__
3240 tests = [t]
3241 else:
3242 tests = getattr(self, '_TESTS', [])
3243 for t in tests:
3244 if not include_onlymatching and t.get('only_matching', False):
3245 continue
3246 t['name'] = type(self).__name__[:-len('IE')]
3247 yield t
3248
3249 def is_suitable(self, age_limit):
3250 """ Test whether the extractor is generally suitable for the given
3251 age limit (i.e. pornographic sites are not, all others usually are) """
3252
3253 any_restricted = False
3254 for tc in self.get_testcases(include_onlymatching=False):
3255 if tc.get('playlist', []):
3256 tc = tc['playlist'][0]
3257 is_restricted = age_restricted(
3258 tc.get('info_dict', {}).get('age_limit'), age_limit)
3259 if not is_restricted:
3260 return True
3261 any_restricted = any_restricted or is_restricted
3262 return not any_restricted
3263
3264 def extract_subtitles(self, *args, **kwargs):
3265 if (self._downloader.params.get('writesubtitles', False)
3266 or self._downloader.params.get('listsubtitles')):
3267 return self._get_subtitles(*args, **kwargs)
3268 return {}
3269
3270 def _get_subtitles(self, *args, **kwargs):
3271 raise NotImplementedError('This method must be implemented by subclasses')
3272
3273 @staticmethod
3274 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3275 """ Merge subtitle items for one language. Items with duplicated URLs
3276 will be dropped. """
3277 list1_urls = set([item['url'] for item in subtitle_list1])
3278 ret = list(subtitle_list1)
3279 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3280 return ret
3281
3282 @classmethod
3283 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3284 """ Merge two subtitle dictionaries, language by language. """
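        # e.g. merging {'en': [{'url': 'http://a/en.vtt'}]} with
        # {'en': [{'url': 'http://b/en.vtt'}], 'de': [{'url': 'http://b/de.vtt'}]}
        # keeps both 'en' items and adds the 'de' item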
3285 ret = dict(subtitle_dict1)
3286 for lang in subtitle_dict2:
3287 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3288 return ret
3289
3290 def extract_automatic_captions(self, *args, **kwargs):
3291 if (self._downloader.params.get('writeautomaticsub', False)
3292 or self._downloader.params.get('listsubtitles')):
3293 return self._get_automatic_captions(*args, **kwargs)
3294 return {}
3295
3296 def _get_automatic_captions(self, *args, **kwargs):
3297 raise NotImplementedError('This method must be implemented by subclasses')
3298
3299 def mark_watched(self, *args, **kwargs):
3300 if (self._downloader.params.get('mark_watched', False)
3301 and (self._get_login_info()[0] is not None
3302 or self._downloader.params.get('cookiefile') is not None)):
3303 self._mark_watched(*args, **kwargs)
3304
3305 def _mark_watched(self, *args, **kwargs):
3306 raise NotImplementedError('This method must be implemented by subclasses')
3307
3308 def geo_verification_headers(self):
3309 headers = {}
3310 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3311 if geo_verification_proxy:
3312 headers['Ytdl-request-proxy'] = geo_verification_proxy
3313 return headers
3314
3315 def _generic_id(self, url):
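        # e.g. 'https://example.com/media/some%20video.mp4/' -> 'some video'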
3316 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3317
3318 def _generic_title(self, url):
3319 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3320
3321
3322 class SearchInfoExtractor(InfoExtractor):
3323 """
3324 Base class for paged search queries extractors.
3325     They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3326 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3327 """
3328
3329 @classmethod
3330 def _make_valid_url(cls):
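        # e.g. with _SEARCH_KEY == 'ytsearch' this matches 'ytsearch:cats',
        # 'ytsearch5:cats' and 'ytsearchall:cats'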
3331 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3332
3333 @classmethod
3334 def suitable(cls, url):
3335 return re.match(cls._make_valid_url(), url) is not None
3336
3337 def _real_extract(self, query):
3338 mobj = re.match(self._make_valid_url(), query)
3339 if mobj is None:
3340 raise ExtractorError('Invalid search query "%s"' % query)
3341
3342 prefix = mobj.group('prefix')
3343 query = mobj.group('query')
3344 if prefix == '':
3345 return self._get_n_results(query, 1)
3346 elif prefix == 'all':
3347 return self._get_n_results(query, self._MAX_RESULTS)
3348 else:
3349 n = int(prefix)
3350 if n <= 0:
3351                     raise ExtractorError('Invalid download number %s for query "%s"' % (n, query))
3352 elif n > self._MAX_RESULTS:
3353 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3354 n = self._MAX_RESULTS
3355 return self._get_n_results(query, n)
3356
3357 def _get_n_results(self, query, n):
3358 """Get a specified number of results for a query"""
3359 raise NotImplementedError('This method must be implemented by subclasses')
3360
3361 @property
3362 def SEARCH_KEY(self):
3363 return self._SEARCH_KEY