# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_integer_types,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present, it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
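
    For instance, a minimal single-video result might look like this
    (illustrative, made-up values):

        {
            'id': '12345',
            'title': 'Example video title',
            'url': 'https://media.example.com/video.mp4',
            'ext': 'mp4',
        }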


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
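
    For example (illustrative values):

        {
            '_type': 'url',
            'url': 'https://video.example.com/watch/12345',
            'ie_key': 'Example',
        }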


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
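
    A hypothetical minimal subclass (the site, URL pattern, and helper calls
    below are illustrative only) could look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }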

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

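        Example (made-up country codes and IP block):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })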
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check the X-Forwarded-For HTTP header in order to figure
        # out the origin of the client behind a proxy. This allows bypassing
        # geo restriction by faking this header's value to an IP that belongs
        # to some geo unrestricted country. We will do so once we encounter
        # any geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx),
            which are always accepted.
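
            For instance (hypothetical call), a site that returns a useful
            JSON body with a 404 status could be handled with:

                page = self._download_webpage(url, video_id, expected_status=404)

            or, accepting any 4xx status via a callable:

                page = self._download_webpage(
                    url, video_id, expected_status=lambda x: 400 <= x < 500)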
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users', metadata_available=False):
        if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg)
            return
        raise ExtractorError(
            '%s. Use --cookies, --username and --password or --netrc to provide account credentials' % msg,
            expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self._downloader.params.get('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning, or raise
        a RegexNotFoundError, depending on fatal and default, specifying the
        field name.
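
        Example (hypothetical pattern and webpage variable):

            title = self._search_regex(
                r'<h1 class="video-title">([^<]+)</h1>', webpage, 'title',
                default=None)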
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None.
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
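        # Builds patterns matching OpenGraph tags such as (illustrative)
        # <meta property="og:title" content="..."> - the two templates below
        # cover both orders of the name/property and content attributes.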
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
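        # i.e. pages carrying the label
        # <meta name="rating" content="RTA-5042-1996-1400-1577-RTA">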
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
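        # e.g. (illustrative) <meta itemprop="isFamilyFriendly" content="true">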
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per spec), with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

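    # For instance (illustrative form field), an HTML document containing
    # <input type="hidden" name="csrf_token" value="abc123"> yields
    # {'csrf_token': 'abc123'} from _hidden_inputs().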
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
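        # The regex above parses a single field token of a format sort string
        # into its reverse marker, field name and optional limit; e.g. the
        # (hypothetical) token '+res:480' gives reverse='+', field='res',
        # separator=':' and limit='480'.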

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'has_audio', 'source', 'format_id')  # These must not be aliases

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none'},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        _order = []

        def _get_field_setting(self, field, key):
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
1531 return list_length - empty_pos # not in list
1532 else:  # not regex, or value is None
1533 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1534 else:
1535 if value.isnumeric():
1536 return float(value)
1537 else:
1538 self.settings[field]['convert'] = 'string'
1539 return value
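# Worked example of the 'order' conversion above (current 11-entry vcodec
# list): matches are scored by their distance from the end of the list, so
# 'av01' -> 11, 'vp9' -> 9, 'none' -> 1, and any unrecognized codec string
# falls back to the score of the '' placeholder (here 3). Higher is better.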
1540
1541 def evaluate_params(self, params, sort_extractor):
1542 self._use_free_order = params.get('prefer_free_formats', False)
1543 self._sort_user = params.get('format_sort', [])
1544 self._sort_extractor = sort_extractor
1545
1546 def add_item(field, reverse, closest, limit_text):
1547 field = field.lower()
1548 if field in self._order:
1549 return
1550 self._order.append(field)
1551 limit = self._resolve_field_value(field, limit_text)
1552 data = {
1553 'reverse': reverse,
1554 'closest': False if limit is None else closest,
1555 'limit_text': limit_text,
1556 'limit': limit}
1557 if field in self.settings:
1558 self.settings[field].update(data)
1559 else:
1560 self.settings[field] = data
1561
1562 sort_list = (
1563 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1564 + (tuple() if params.get('format_sort_force', False)
1565 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1566 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1567
1568 for item in sort_list:
1569 match = re.match(self.regex, item)
1570 if match is None:
1571 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1572 field = match.group('field')
1573 if field is None:
1574 continue
1575 if self._get_field_setting(field, 'type') == 'alias':
1576 field = self._get_field_setting(field, 'field')
1577 reverse = match.group('reverse') is not None
1578 closest = match.group('separator') == '~'
1579 limit_text = match.group('limit')
1580
1581 has_limit = limit_text is not None
1582 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1583 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1584
1585 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1586 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1587 limit_count = len(limits)
1588 for (i, f) in enumerate(fields):
1589 add_item(f, reverse, closest,
1590 limits[i] if i < limit_count
1591 else limits[0] if has_limit and not has_multiple_limits
1592 else None)
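# Illustration of how tokens expand here (hypothetical user input):
# '+br:1000' names a 'combined' field, so add_item is called for tbr, vbr
# and abr, each in ascending order (prefer the smallest value not below the
# limit) and, because of same_limit, each with limit 1000; 'res~720' stays
# a single field with closest=True, preferring the resolution nearest 720.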
1593
1594 def print_verbose_info(self, to_screen):
1595 if self._sort_user:
1596 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1597 if self._sort_extractor:
1598 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1599 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1600 '+' if self._get_field_setting(field, 'reverse') else '', field,
1601 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1602 self._get_field_setting(field, 'limit_text'),
1603 self._get_field_setting(field, 'limit'))
1604 if self._get_field_setting(field, 'limit_text') is not None else '')
1605 for field in self._order if self._get_field_setting(field, 'visible')]))
1606
1607 def _calculate_field_preference_from_value(self, format, field, type, value):
1608 reverse = self._get_field_setting(field, 'reverse')
1609 closest = self._get_field_setting(field, 'closest')
1610 limit = self._get_field_setting(field, 'limit')
1611
1612 if type == 'extractor':
1613 maximum = self._get_field_setting(field, 'max')
1614 if value is None or (maximum is not None and value >= maximum):
1615 value = -1
1616 elif type == 'boolean':
1617 in_list = self._get_field_setting(field, 'in_list')
1618 not_in_list = self._get_field_setting(field, 'not_in_list')
1619 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1620 elif type == 'ordered':
1621 value = self._resolve_field_value(field, value, True)
1622
1623 # try to convert to number
1624 val_num = float_or_none(value)
1625 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1626 if is_num:
1627 value = val_num
1628
1629 return ((-10, 0) if value is None
1630 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1631 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1632 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1633 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1634 else (-1, value, 0))
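# Worked example (hypothetical heights, limit=720, closest=True,
# reverse=False): 480 -> (0, -240, 240), 720 -> (0, 0, 0),
# 1080 -> (0, -360, -360). Since formats sort ascending (worst first),
# the exact match ranks best, and for equal distances the third element
# makes the value below the limit win (480 beats 960).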
1635
1636 def _calculate_field_preference(self, format, field):
1637 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1638 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1639 if type == 'multiple':
1640 type = 'field' # Only 'field' is allowed in multiple for now
1641 actual_fields = self._get_field_setting(field, 'field')
1642
1643 def wrapped_function(values):
1644 values = tuple(filter(lambda x: x is not None, values))
1645 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1646 else values[0] if values
1647 else None)
1648
1649 value = wrapped_function((get_value(f) for f in actual_fields))
1650 else:
1651 value = get_value(field)
1652 return self._calculate_field_preference_from_value(format, field, type, value)
1653
1654 def calculate_preference(self, format):
1655 # Determine missing protocol
1656 if not format.get('protocol'):
1657 format['protocol'] = determine_protocol(format)
1658
1659 # Determine missing ext
1660 if not format.get('ext') and 'url' in format:
1661 format['ext'] = determine_ext(format['url'])
1662 if format.get('vcodec') == 'none':
1663 format['audio_ext'] = format['ext']
1664 format['video_ext'] = 'none'
1665 else:
1666 format['video_ext'] = format['ext']
1667 format['audio_ext'] = 'none'
1668 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1669 # format['preference'] = -1000
1670
1671 # Determine missing bitrates
1672 if format.get('tbr') is None:
1673 if format.get('vbr') is not None and format.get('abr') is not None:
1674 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1675 else:
1676 if format.get('vcodec') != "none" and format.get('vbr') is None:
1677 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1678 if format.get('acodec') != "none" and format.get('abr') is None:
1679 format['abr'] = format.get('tbr') - format.get('vbr', 0)
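# Bitrate illustration (values in kbps, hypothetical): a format with
# vbr=2500 and abr=128 but no tbr is given tbr=2628 above; conversely, a
# video format with tbr=3000 and abr=128 gets vbr=2872.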
1680
1681 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1682
1683 def _sort_formats(self, formats, field_preference=[]):
1684 if not formats:
1685 if self._downloader.params.get('ignore_no_formats_error'):
1686 return
1687 raise ExtractorError('No video formats found')
1688 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1689 format_sort.evaluate_params(self._downloader.params, field_preference)
1690 if self._downloader.params.get('verbose', False):
1691 format_sort.print_verbose_info(self._downloader.to_screen)
1692 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1693
1694 def _check_formats(self, formats, video_id):
1695 if formats:
1696 formats[:] = filter(
1697 lambda f: self._is_valid_url(
1698 f['url'], video_id,
1699 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1700 formats)
1701
1702 @staticmethod
1703 def _remove_duplicate_formats(formats):
1704 format_urls = set()
1705 unique_formats = []
1706 for f in formats:
1707 if f['url'] not in format_urls:
1708 format_urls.add(f['url'])
1709 unique_formats.append(f)
1710 formats[:] = unique_formats
1711
1712 def _is_valid_url(self, url, video_id, item='video', headers={}):
1713 url = self._proto_relative_url(url, scheme='http:')
1714 # For now assume non HTTP(S) URLs always valid
1715 if not (url.startswith('http://') or url.startswith('https://')):
1716 return True
1717 try:
1718 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1719 return True
1720 except ExtractorError as e:
1721 self.to_screen(
1722 '%s: %s URL is invalid, skipping: %s'
1723 % (video_id, item, error_to_compat_str(e.cause)))
1724 return False
1725
1726 def http_scheme(self):
1727 """ Either "http:" or "https:", depending on the user's preferences """
1728 return (
1729 'http:'
1730 if self._downloader.params.get('prefer_insecure', False)
1731 else 'https:')
1732
1733 def _proto_relative_url(self, url, scheme=None):
1734 if url is None:
1735 return url
1736 if url.startswith('//'):
1737 if scheme is None:
1738 scheme = self.http_scheme()
1739 return scheme + url
1740 else:
1741 return url
1742
1743 def _sleep(self, timeout, video_id, msg_template=None):
1744 if msg_template is None:
1745 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1746 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1747 self.to_screen(msg)
1748 time.sleep(timeout)
1749
1750 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1751 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1752 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1753 manifest = self._download_xml(
1754 manifest_url, video_id, 'Downloading f4m manifest',
1755 'Unable to download f4m manifest',
1756 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1757 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1758 transform_source=transform_source,
1759 fatal=fatal, data=data, headers=headers, query=query)
1760
1761 if manifest is False:
1762 return []
1763
1764 return self._parse_f4m_formats(
1765 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1766 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1767
1768 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1769 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1770 fatal=True, m3u8_id=None):
1771 if not isinstance(manifest, compat_etree_Element) and not fatal:
1772 return []
1773
1774 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1775 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1776 if akamai_pv is not None and ';' in akamai_pv.text:
1777 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1778 if playerVerificationChallenge.strip() != '':
1779 return []
1780
1781 formats = []
1782 manifest_version = '1.0'
1783 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1784 if not media_nodes:
1785 manifest_version = '2.0'
1786 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1787 # Remove unsupported DRM protected media from final formats
1788 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1789 media_nodes = remove_encrypted_media(media_nodes)
1790 if not media_nodes:
1791 return formats
1792
1793 manifest_base_url = get_base_url(manifest)
1794
1795 bootstrap_info = xpath_element(
1796 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1797 'bootstrap info', default=None)
1798
1799 vcodec = None
1800 mime_type = xpath_text(
1801 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1802 'base URL', default=None)
1803 if mime_type and mime_type.startswith('audio/'):
1804 vcodec = 'none'
1805
1806 for i, media_el in enumerate(media_nodes):
1807 tbr = int_or_none(media_el.attrib.get('bitrate'))
1808 width = int_or_none(media_el.attrib.get('width'))
1809 height = int_or_none(media_el.attrib.get('height'))
1810 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1811 # If <bootstrapInfo> is present, the specified f4m is a
1812 # stream-level manifest, and only set-level manifests may refer to
1813 # external resources. See section 11.4 and section 4 of F4M spec
1814 if bootstrap_info is None:
1815 media_url = None
1816 # @href is introduced in 2.0, see section 11.6 of F4M spec
1817 if manifest_version == '2.0':
1818 media_url = media_el.attrib.get('href')
1819 if media_url is None:
1820 media_url = media_el.attrib.get('url')
1821 if not media_url:
1822 continue
1823 manifest_url = (
1824 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1825 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1826 # If media_url is itself an f4m manifest, extract it recursively, since
1827 # the bitrates in the parent manifest (this one) and in the media_url
1828 # manifest may differ, making it impossible for the f4m downloader to
1829 # resolve the format by the requested bitrate.
1830 ext = determine_ext(manifest_url)
1831 if ext == 'f4m':
1832 f4m_formats = self._extract_f4m_formats(
1833 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1834 transform_source=transform_source, fatal=fatal)
1835 # Sometimes a stream-level manifest contains a single media entry that
1836 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player),
1837 # while the parent's media entry in the set-level manifest may
1838 # contain it. We copy it from the parent in such cases.
1839 if len(f4m_formats) == 1:
1840 f = f4m_formats[0]
1841 f.update({
1842 'tbr': f.get('tbr') or tbr,
1843 'width': f.get('width') or width,
1844 'height': f.get('height') or height,
1845 'format_id': f.get('format_id') if not tbr else format_id,
1846 'vcodec': vcodec,
1847 })
1848 formats.extend(f4m_formats)
1849 continue
1850 elif ext == 'm3u8':
1851 formats.extend(self._extract_m3u8_formats(
1852 manifest_url, video_id, 'mp4', preference=preference,
1853 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1854 continue
1855 formats.append({
1856 'format_id': format_id,
1857 'url': manifest_url,
1858 'manifest_url': manifest_url,
1859 'ext': 'flv' if bootstrap_info is not None else None,
1860 'protocol': 'f4m',
1861 'tbr': tbr,
1862 'width': width,
1863 'height': height,
1864 'vcodec': vcodec,
1865 'preference': preference,
1866 'quality': quality,
1867 })
1868 return formats
1869
1870 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1871 return {
1872 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1873 'url': m3u8_url,
1874 'ext': ext,
1875 'protocol': 'm3u8',
1876 'preference': preference - 100 if preference else -100,
1877 'quality': quality,
1878 'resolution': 'multiple',
1879 'format_note': 'Quality selection URL',
1880 }
1881
1882 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1883 entry_protocol='m3u8', preference=None, quality=None,
1884 m3u8_id=None, note=None, errnote=None,
1885 fatal=True, live=False, data=None, headers={},
1886 query={}):
1887 res = self._download_webpage_handle(
1888 m3u8_url, video_id,
1889 note=note or 'Downloading m3u8 information',
1890 errnote=errnote or 'Failed to download m3u8 information',
1891 fatal=fatal, data=data, headers=headers, query=query)
1892
1893 if res is False:
1894 return []
1895
1896 m3u8_doc, urlh = res
1897 m3u8_url = urlh.geturl()
1898
1899 return self._parse_m3u8_formats(
1900 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1901 preference=preference, quality=quality, m3u8_id=m3u8_id,
1902 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1903 headers=headers, query=query, video_id=video_id)
1904
1905 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1906 entry_protocol='m3u8', preference=None, quality=None,
1907 m3u8_id=None, live=False, note=None, errnote=None,
1908 fatal=True, data=None, headers={}, query={}, video_id=None):
1909 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1910 return []
1911
1912 if (not self._downloader.params.get('allow_unplayable_formats')
1913 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1914 return []
1915
1916 formats = []
1917
1918 format_url = lambda u: (
1919 u
1920 if re.match(r'^https?://', u)
1921 else compat_urlparse.urljoin(m3u8_url, u))
1922
1923 split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1924
1925 # References:
1926 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1927 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1928 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1929
1930 # We should try extracting formats only from master playlists [1, 4.3.4],
1931 # i.e. playlists that describe the available qualities. On the other hand,
1932 # media playlists [1, 4.3.3] should be returned as is, since they contain
1933 # just the media, without quality renditions.
1934 # Fortunately, a master playlist can easily be distinguished from a media
1935 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
1936 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1937 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
1938 # media playlist and MUST NOT appear in a master playlist, so we can
1939 # reliably detect a media playlist with this criterion.
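# A minimal sketch of the two playlist kinds told apart here
# (hypothetical contents). Master playlist:
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=1280x720
#   hi/index.m3u8
# Media playlist (note the required EXT-X-TARGETDURATION):
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   segment0.ts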
1940
1941 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
1942 fatal=True, data=None, headers={}):
1943 if not m3u8_doc:
1944 if not format_url:
1945 return []
1946 res = self._download_webpage_handle(
1947 format_url, video_id,
1948 note=False,
1949 errnote='Failed to download m3u8 playlist information',
1950 fatal=fatal, data=data, headers=headers)
1951
1952 if res is False:
1953 return []
1954
1955 m3u8_doc, urlh = res
1956 format_url = urlh.geturl()
1957
1958 playlist_formats = []
1959 i = (
1960 0
1961 if split_discontinuity
1962 else None)
1963 format_info = {
1964 'index': i,
1965 'key_data': None,
1966 'files': [],
1967 }
1968 for line in m3u8_doc.splitlines():
1969 if not line.startswith('#'):
1970 format_info['files'].append(line)
1971 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1972 i += 1
1973 playlist_formats.append(format_info)
1974 format_info = {
1975 'index': i,
1976 'url': format_url,
1977 'files': [],
1978 }
1979 playlist_formats.append(format_info)
1980 return playlist_formats
1981
1982 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1983
1984 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1985
1986 for format in playlist_formats:
1987 format_id = []
1988 if m3u8_id:
1989 format_id.append(m3u8_id)
1990 format_index = format.get('index')
1991 if format_index:
1992 format_id.append(str(format_index))
1993 f = {
1994 'format_id': '-'.join(format_id),
1995 'format_index': format_index,
1996 'url': m3u8_url,
1997 'ext': ext,
1998 'protocol': entry_protocol,
1999 'preference': preference,
2000 'quality': quality,
2001 }
2002 formats.append(f)
2003
2004 return formats
2005
2006 groups = {}
2007 last_stream_inf = {}
2008
2009 def extract_media(x_media_line):
2010 media = parse_m3u8_attributes(x_media_line)
2011 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
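# A typical line looks like this (values hypothetical):
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="audio/en/index.m3u8"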
2012 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2013 if not (media_type and group_id and name):
2014 return
2015 groups.setdefault(group_id, []).append(media)
2016 if media_type not in ('VIDEO', 'AUDIO'):
2017 return
2018 media_url = media.get('URI')
2019 if media_url:
2020 manifest_url = format_url(media_url)
2021 format_id = []
2022 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2023 fatal=fatal, data=data, headers=headers)
2024
2025 for format in playlist_formats:
2026 format_index = format.get('index')
2027 for v in (m3u8_id, group_id, name):
2028 if v:
2029 format_id.append(v)
2030 if format_index:
2031 format_id.append(str(format_index))
2032 f = {
2033 'format_id': '-'.join(format_id),
2034 'format_index': format_index,
2035 'url': manifest_url,
2036 'manifest_url': m3u8_url,
2037 'language': media.get('LANGUAGE'),
2038 'ext': ext,
2039 'protocol': entry_protocol,
2040 'preference': preference,
2041 'quality': quality,
2042 }
2043 if media_type == 'AUDIO':
2044 f['vcodec'] = 'none'
2045 formats.append(f)
2046
2047 def build_stream_name():
2048 # Despite specification does not mention NAME attribute for
2049 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2050 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2051 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2052 stream_name = last_stream_inf.get('NAME')
2053 if stream_name:
2054 return stream_name
2055 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2056 # from the corresponding rendition group
2057 stream_group_id = last_stream_inf.get('VIDEO')
2058 if not stream_group_id:
2059 return
2060 stream_group = groups.get(stream_group_id)
2061 if not stream_group:
2062 return stream_group_id
2063 rendition = stream_group[0]
2064 return rendition.get('NAME') or stream_group_id
2065
2066 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2067 # formats can be detected even when EXT-X-STREAM-INF tags precede
2068 # EXT-X-MEDIA tags in an HLS manifest, as in [3].
2069 for line in m3u8_doc.splitlines():
2070 if line.startswith('#EXT-X-MEDIA:'):
2071 extract_media(line)
2072
2073 for line in m3u8_doc.splitlines():
2074 if line.startswith('#EXT-X-STREAM-INF:'):
2075 last_stream_inf = parse_m3u8_attributes(line)
2076 elif line.startswith('#') or not line.strip():
2077 continue
2078 else:
2079 tbr = float_or_none(
2080 last_stream_inf.get('AVERAGE-BANDWIDTH')
2081 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2082 manifest_url = format_url(line.strip())
2083
2084 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2085 fatal=fatal, data=data, headers=headers)
2086
2087 for frmt in playlist_formats:
2088 format_id = []
2089 if m3u8_id:
2090 format_id.append(m3u8_id)
2091 format_index = frmt.get('index')
2092 stream_name = build_stream_name()
2093 # The bandwidth of live streams may vary over time, making
2094 # format_id unpredictable, so it is better to keep the provided
2095 # format_id intact.
2096 if not live:
2097 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2098 if format_index:
2099 format_id.append(str(format_index))
2100 f = {
2101 'format_id': '-'.join(format_id),
2102 'format_index': format_index,
2103 'url': manifest_url,
2104 'manifest_url': m3u8_url,
2105 'tbr': tbr,
2106 'ext': ext,
2107 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2108 'protocol': entry_protocol,
2109 'preference': preference,
2110 'quality': quality,
2111 }
2112 resolution = last_stream_inf.get('RESOLUTION')
2113 if resolution:
2114 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2115 if mobj:
2116 f['width'] = int(mobj.group('width'))
2117 f['height'] = int(mobj.group('height'))
2118 # Unified Streaming Platform
2119 mobj = re.search(
2120 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2121 if mobj:
2122 abr, vbr = mobj.groups()
2123 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2124 f.update({
2125 'vbr': vbr,
2126 'abr': abr,
2127 })
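# Illustration (hypothetical USP URL): for a URL ending in
# '...ism/index.m3u8?audio=128000-video=2400000' the regex above captures
# bits/s, and the update yields abr=128.0 and vbr=2400.0 (kbps).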
2128 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2129 f.update(codecs)
2130 audio_group_id = last_stream_inf.get('AUDIO')
2131 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2132 # references a rendition group MUST have a CODECS attribute.
2133 # However, this is not always respected: for example, [2]
2134 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2135 # rendition group but has no CODECS attribute and, despite
2136 # referencing an audio group, represents a complete
2137 # (audio and video) format. For such cases we ignore
2138 # references to rendition groups and treat them
2139 # as complete formats.
2140 if audio_group_id and codecs and f.get('vcodec') != 'none':
2141 audio_group = groups.get(audio_group_id)
2142 if audio_group and audio_group[0].get('URI'):
2143 # TODO: update acodec for audio only formats with
2144 # the same GROUP-ID
2145 f['acodec'] = 'none'
2146 if not f.get('ext'):
2147 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2148 formats.append(f)
2149
2150 # for DailyMotion
2151 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2152 if progressive_uri:
2153 http_f = f.copy()
2154 del http_f['manifest_url']
2155 http_f.update({
2156 'format_id': f['format_id'].replace('hls-', 'http-'),
2157 'protocol': 'http',
2158 'url': progressive_uri,
2159 })
2160 formats.append(http_f)
2161
2162 last_stream_inf = {}
2163 return formats
2164
2165 @staticmethod
2166 def _xpath_ns(path, namespace=None):
2167 if not namespace:
2168 return path
2169 out = []
2170 for c in path.split('/'):
2171 if not c or c == '.':
2172 out.append(c)
2173 else:
2174 out.append('{%s}%s' % (namespace, c))
2175 return '/'.join(out)
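# For illustration: _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
# ('.' components are kept as-is).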
2176
2177 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2178 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2179
2180 if smil is False:
2181 assert not fatal
2182 return []
2183
2184 namespace = self._parse_smil_namespace(smil)
2185
2186 return self._parse_smil_formats(
2187 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2188
2189 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2190 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2191 if smil is False:
2192 return {}
2193 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2194
2195 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2196 return self._download_xml(
2197 smil_url, video_id, 'Downloading SMIL file',
2198 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2199
2200 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2201 namespace = self._parse_smil_namespace(smil)
2202
2203 formats = self._parse_smil_formats(
2204 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2205 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2206
2207 video_id = os.path.splitext(url_basename(smil_url))[0]
2208 title = None
2209 description = None
2210 upload_date = None
2211 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2212 name = meta.attrib.get('name')
2213 content = meta.attrib.get('content')
2214 if not name or not content:
2215 continue
2216 if not title and name == 'title':
2217 title = content
2218 elif not description and name in ('description', 'abstract'):
2219 description = content
2220 elif not upload_date and name == 'date':
2221 upload_date = unified_strdate(content)
2222
2223 thumbnails = [{
2224 'id': image.get('type'),
2225 'url': image.get('src'),
2226 'width': int_or_none(image.get('width')),
2227 'height': int_or_none(image.get('height')),
2228 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2229
2230 return {
2231 'id': video_id,
2232 'title': title or video_id,
2233 'description': description,
2234 'upload_date': upload_date,
2235 'thumbnails': thumbnails,
2236 'formats': formats,
2237 'subtitles': subtitles,
2238 }
2239
2240 def _parse_smil_namespace(self, smil):
2241 return self._search_regex(
2242 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2243
2244 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2245 base = smil_url
2246 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2247 b = meta.get('base') or meta.get('httpBase')
2248 if b:
2249 base = b
2250 break
2251
2252 formats = []
2253 rtmp_count = 0
2254 http_count = 0
2255 m3u8_count = 0
2256
2257 srcs = []
2258 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2259 for medium in media:
2260 src = medium.get('src')
2261 if not src or src in srcs:
2262 continue
2263 srcs.append(src)
2264
2265 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2266 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2267 width = int_or_none(medium.get('width'))
2268 height = int_or_none(medium.get('height'))
2269 proto = medium.get('proto')
2270 ext = medium.get('ext')
2271 src_ext = determine_ext(src)
2272 streamer = medium.get('streamer') or base
2273
2274 if proto == 'rtmp' or streamer.startswith('rtmp'):
2275 rtmp_count += 1
2276 formats.append({
2277 'url': streamer,
2278 'play_path': src,
2279 'ext': 'flv',
2280 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2281 'tbr': bitrate,
2282 'filesize': filesize,
2283 'width': width,
2284 'height': height,
2285 })
2286 if transform_rtmp_url:
2287 streamer, src = transform_rtmp_url(streamer, src)
2288 formats[-1].update({
2289 'url': streamer,
2290 'play_path': src,
2291 })
2292 continue
2293
2294 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2295 src_url = src_url.strip()
2296
2297 if proto == 'm3u8' or src_ext == 'm3u8':
2298 m3u8_formats = self._extract_m3u8_formats(
2299 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2300 if len(m3u8_formats) == 1:
2301 m3u8_count += 1
2302 m3u8_formats[0].update({
2303 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2304 'tbr': bitrate,
2305 'width': width,
2306 'height': height,
2307 })
2308 formats.extend(m3u8_formats)
2309 elif src_ext == 'f4m':
2310 f4m_url = src_url
2311 if not f4m_params:
2312 f4m_params = {
2313 'hdcore': '3.2.0',
2314 'plugin': 'flowplayer-3.2.0.1',
2315 }
2316 f4m_url += '&' if '?' in f4m_url else '?'
2317 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2318 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2319 elif src_ext == 'mpd':
2320 formats.extend(self._extract_mpd_formats(
2321 src_url, video_id, mpd_id='dash', fatal=False))
2322 elif re.search(r'\.ism/[Mm]anifest', src_url):
2323 formats.extend(self._extract_ism_formats(
2324 src_url, video_id, ism_id='mss', fatal=False))
2325 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2326 http_count += 1
2327 formats.append({
2328 'url': src_url,
2329 'ext': ext or src_ext or 'flv',
2330 'format_id': 'http-%d' % (bitrate or http_count),
2331 'tbr': bitrate,
2332 'filesize': filesize,
2333 'width': width,
2334 'height': height,
2335 })
2336
2337 return formats
2338
2339 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2340 urls = []
2341 subtitles = {}
2342 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2343 src = textstream.get('src')
2344 if not src or src in urls:
2345 continue
2346 urls.append(src)
2347 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2348 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2349 subtitles.setdefault(lang, []).append({
2350 'url': src,
2351 'ext': ext,
2352 })
2353 return subtitles
2354
2355 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2356 xspf = self._download_xml(
2357 xspf_url, playlist_id, 'Downloading xspf playlist',
2358 'Unable to download xspf playlist', fatal=fatal)
2359 if xspf is False:
2360 return []
2361 return self._parse_xspf(
2362 xspf, playlist_id, xspf_url=xspf_url,
2363 xspf_base_url=base_url(xspf_url))
2364
2365 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2366 NS_MAP = {
2367 'xspf': 'http://xspf.org/ns/0/',
2368 's1': 'http://static.streamone.nl/player/ns/0',
2369 }
2370
2371 entries = []
2372 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2373 title = xpath_text(
2374 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2375 description = xpath_text(
2376 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2377 thumbnail = xpath_text(
2378 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2379 duration = float_or_none(
2380 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2381
2382 formats = []
2383 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2384 format_url = urljoin(xspf_base_url, location.text)
2385 if not format_url:
2386 continue
2387 formats.append({
2388 'url': format_url,
2389 'manifest_url': xspf_url,
2390 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2391 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2392 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2393 })
2394 self._sort_formats(formats)
2395
2396 entries.append({
2397 'id': playlist_id,
2398 'title': title,
2399 'description': description,
2400 'thumbnail': thumbnail,
2401 'duration': duration,
2402 'formats': formats,
2403 })
2404 return entries
2405
2406 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2407 res = self._download_xml_handle(
2408 mpd_url, video_id,
2409 note=note or 'Downloading MPD manifest',
2410 errnote=errnote or 'Failed to download MPD manifest',
2411 fatal=fatal, data=data, headers=headers, query=query)
2412 if res is False:
2413 return []
2414 mpd_doc, urlh = res
2415 if mpd_doc is None:
2416 return []
2417 mpd_base_url = base_url(urlh.geturl())
2418
2419 return self._parse_mpd_formats(
2420 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2421
2422 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2423 """
2424 Parse formats from MPD manifest.
2425 References:
2426 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2427 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2428 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2429 """
2430 if not self._downloader.params.get('dynamic_mpd', True):
2431 if mpd_doc.get('type') == 'dynamic':
2432 return []
2433
2434 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2435
2436 def _add_ns(path):
2437 return self._xpath_ns(path, namespace)
2438
2439 def is_drm_protected(element):
2440 return element.find(_add_ns('ContentProtection')) is not None
2441
2442 def extract_multisegment_info(element, ms_parent_info):
2443 ms_info = ms_parent_info.copy()
2444
2445 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2446 # common attributes and elements; we only extract the ones relevant
2447 # to us.
2448 def extract_common(source):
2449 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2450 if segment_timeline is not None:
2451 s_e = segment_timeline.findall(_add_ns('S'))
2452 if s_e:
2453 ms_info['total_number'] = 0
2454 ms_info['s'] = []
2455 for s in s_e:
2456 r = int(s.get('r', 0))
2457 ms_info['total_number'] += 1 + r
2458 ms_info['s'].append({
2459 't': int(s.get('t', 0)),
2460 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2461 'd': int(s.attrib['d']),
2462 'r': r,
2463 })
2464 start_number = source.get('startNumber')
2465 if start_number:
2466 ms_info['start_number'] = int(start_number)
2467 timescale = source.get('timescale')
2468 if timescale:
2469 ms_info['timescale'] = int(timescale)
2470 segment_duration = source.get('duration')
2471 if segment_duration:
2472 ms_info['segment_duration'] = float(segment_duration)
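# Worked example (hypothetical manifest): a SegmentTimeline entry
#   <S t="0" d="180000" r="2"/>
# with timescale="90000" contributes 1 + r = 3 segments of 2 seconds each
# and stores {'t': 0, 'd': 180000, 'r': 2} in ms_info['s'] for the
# fragment expansion further below.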
2473
2474 def extract_Initialization(source):
2475 initialization = source.find(_add_ns('Initialization'))
2476 if initialization is not None:
2477 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2478
2479 segment_list = element.find(_add_ns('SegmentList'))
2480 if segment_list is not None:
2481 extract_common(segment_list)
2482 extract_Initialization(segment_list)
2483 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2484 if segment_urls_e:
2485 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2486 else:
2487 segment_template = element.find(_add_ns('SegmentTemplate'))
2488 if segment_template is not None:
2489 extract_common(segment_template)
2490 media = segment_template.get('media')
2491 if media:
2492 ms_info['media'] = media
2493 initialization = segment_template.get('initialization')
2494 if initialization:
2495 ms_info['initialization'] = initialization
2496 else:
2497 extract_Initialization(segment_template)
2498 return ms_info
2499
2500 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2501
2502 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2503 formats = []
2504 for period in mpd_doc.findall(_add_ns('Period')):
2505 period_duration = parse_duration(period.get('duration')) or mpd_duration
2506 period_ms_info = extract_multisegment_info(period, {
2507 'start_number': 1,
2508 'timescale': 1,
2509 })
2510 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2511 if skip_unplayable and is_drm_protected(adaptation_set):
2512 continue
2513 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2514 for representation in adaptation_set.findall(_add_ns('Representation')):
2515 if skip_unplayable and is_drm_protected(representation):
2516 continue
2517 representation_attrib = adaptation_set.attrib.copy()
2518 representation_attrib.update(representation.attrib)
2519 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2520 mime_type = representation_attrib['mimeType']
2521 content_type = mime_type.split('/')[0]
2522 if content_type == 'text':
2523 # TODO implement WebVTT downloading
2524 pass
2525 elif content_type in ('video', 'audio'):
2526 base_url = ''
2527 for element in (representation, adaptation_set, period, mpd_doc):
2528 base_url_e = element.find(_add_ns('BaseURL'))
2529 if base_url_e is not None:
2530 base_url = base_url_e.text + base_url
2531 if re.match(r'^https?://', base_url):
2532 break
2533 if mpd_base_url and not re.match(r'^https?://', base_url):
2534 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2535 mpd_base_url += '/'
2536 base_url = mpd_base_url + base_url
2537 representation_id = representation_attrib.get('id')
2538 lang = representation_attrib.get('lang')
2539 url_el = representation.find(_add_ns('BaseURL'))
2540 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2541 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2542 f = {
2543 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2544 'manifest_url': mpd_url,
2545 'ext': mimetype2ext(mime_type),
2546 'width': int_or_none(representation_attrib.get('width')),
2547 'height': int_or_none(representation_attrib.get('height')),
2548 'tbr': float_or_none(bandwidth, 1000),
2549 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2550 'fps': int_or_none(representation_attrib.get('frameRate')),
2551 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2552 'format_note': 'DASH %s' % content_type,
2553 'filesize': filesize,
2554 'container': mimetype2ext(mime_type) + '_dash',
2555 }
2556 f.update(parse_codecs(representation_attrib.get('codecs')))
2557 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2558
2559 def prepare_template(template_name, identifiers):
2560 tmpl = representation_ms_info[template_name]
2561 # First of all, % characters outside $...$ templates
2562 # must be escaped by doubling for proper processing
2563 # by % operator string formatting used further (see
2564 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2565 t = ''
2566 in_template = False
2567 for c in tmpl:
2568 t += c
2569 if c == '$':
2570 in_template = not in_template
2571 elif c == '%' and not in_template:
2572 t += c
2573 # Next, $...$ templates are translated to their
2574 # %(...) counterparts to be used with % operator
2575 t = t.replace('$RepresentationID$', representation_id)
2576 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2577 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2578 t = t.replace('$$', '$')  # str.replace returns a new string; without the reassignment '$$' was never unescaped
2579 return t
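# Worked example (hypothetical template): with representation_id 'v1',
# '$RepresentationID$/seg-$Number%05d$.m4s' becomes
# 'v1/seg-%(Number)05d.m4s', so rendering it with {'Number': 3} gives
# 'v1/seg-00003.m4s'.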
2580
2581 # @initialization is a regular template just like the @media one,
2582 # so it should be handled the same way (see
2583 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2584 if 'initialization' in representation_ms_info:
2585 initialization_template = prepare_template(
2586 'initialization',
2587 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2588 # $Time$ shall not be included for @initialization thus
2589 # only $Bandwidth$ remains
2590 ('Bandwidth', ))
2591 representation_ms_info['initialization_url'] = initialization_template % {
2592 'Bandwidth': bandwidth,
2593 }
2594
2595 def location_key(location):
2596 return 'url' if re.match(r'^https?://', location) else 'path'
2597
2598 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2599
2600 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2601 media_location_key = location_key(media_template)
2602
2603 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2604 # can't be used at the same time
2605 if '%(Number' in media_template and 's' not in representation_ms_info:
2606 segment_duration = None
2607 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2608 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2609 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2610 representation_ms_info['fragments'] = [{
2611 media_location_key: media_template % {
2612 'Number': segment_number,
2613 'Bandwidth': bandwidth,
2614 },
2615 'duration': segment_duration,
2616 } for segment_number in range(
2617 representation_ms_info['start_number'],
2618 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2619 else:
2620 # $Number*$ or $Time$ in media template with S list available
2621 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2622 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2623 representation_ms_info['fragments'] = []
2624 segment_time = 0
2625 segment_d = None
2626 segment_number = representation_ms_info['start_number']
2627
2628 def add_segment_url():
2629 segment_url = media_template % {
2630 'Time': segment_time,
2631 'Bandwidth': bandwidth,
2632 'Number': segment_number,
2633 }
2634 representation_ms_info['fragments'].append({
2635 media_location_key: segment_url,
2636 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2637 })
2638
2639 for num, s in enumerate(representation_ms_info['s']):
2640 segment_time = s.get('t') or segment_time
2641 segment_d = s['d']
2642 add_segment_url()
2643 segment_number += 1
2644 for r in range(s.get('r', 0)):
2645 segment_time += segment_d
2646 add_segment_url()
2647 segment_number += 1
2648 segment_time += segment_d
2649 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2650 # No media template
2651 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2652 # or any YouTube dashsegments video
2653 fragments = []
2654 segment_index = 0
2655 timescale = representation_ms_info['timescale']
2656 for s in representation_ms_info['s']:
2657 duration = float_or_none(s['d'], timescale)
2658 for r in range(s.get('r', 0) + 1):
2659 segment_uri = representation_ms_info['segment_urls'][segment_index]
2660 fragments.append({
2661 location_key(segment_uri): segment_uri,
2662 'duration': duration,
2663 })
2664 segment_index += 1
2665 representation_ms_info['fragments'] = fragments
2666 elif 'segment_urls' in representation_ms_info:
2667 # Segment URLs with no SegmentTimeline
2668 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2669 # https://github.com/ytdl-org/youtube-dl/pull/14844
2670 fragments = []
2671 segment_duration = float_or_none(
2672 representation_ms_info['segment_duration'],
2673 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2674 for segment_url in representation_ms_info['segment_urls']:
2675 fragment = {
2676 location_key(segment_url): segment_url,
2677 }
2678 if segment_duration:
2679 fragment['duration'] = segment_duration
2680 fragments.append(fragment)
2681 representation_ms_info['fragments'] = fragments
2682 # If a fragments key is available, then we correctly recognized fragmented media.
2683 # Otherwise we assume unfragmented media with direct access. Technically, that
2684 # assumption is not necessarily correct, since we may simply not yet support
2685 # some forms of fragmented media renditions, but for now we use this fallback.
2686 if 'fragments' in representation_ms_info:
2687 f.update({
2688 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2689 'url': mpd_url or base_url,
2690 'fragment_base_url': base_url,
2691 'fragments': [],
2692 'protocol': 'http_dash_segments',
2693 })
2694 if 'initialization_url' in representation_ms_info:
2695 initialization_url = representation_ms_info['initialization_url']
2696 if not f.get('url'):
2697 f['url'] = initialization_url
2698 f['fragments'].append({location_key(initialization_url): initialization_url})
2699 f['fragments'].extend(representation_ms_info['fragments'])
2700 else:
2701 # Assuming direct URL to unfragmented media.
2702 f['url'] = base_url
2703 formats.append(f)
2704 else:
2705 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2706 return formats
2707
2708 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2709 res = self._download_xml_handle(
2710 ism_url, video_id,
2711 note=note or 'Downloading ISM manifest',
2712 errnote=errnote or 'Failed to download ISM manifest',
2713 fatal=fatal, data=data, headers=headers, query=query)
2714 if res is False:
2715 return []
2716 ism_doc, urlh = res
2717 if ism_doc is None:
2718 return []
2719
2720 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2721
2722 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2723 """
2724 Parse formats from ISM manifest.
2725 References:
2726 1. [MS-SSTR]: Smooth Streaming Protocol,
2727 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2728 """
2729 if ism_doc.get('IsLive') == 'TRUE':
2730 return []
2731 if (not self._downloader.params.get('allow_unplayable_formats')
2732 and ism_doc.find('Protection') is not None):
2733 return []
2734
2735 duration = int(ism_doc.attrib['Duration'])
2736 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2737
2738 formats = []
2739 for stream in ism_doc.findall('StreamIndex'):
2740 stream_type = stream.get('Type')
2741 if stream_type not in ('video', 'audio'):
2742 continue
2743 url_pattern = stream.attrib['Url']
2744 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2745 stream_name = stream.get('Name')
2746 for track in stream.findall('QualityLevel'):
2747 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2748 # TODO: add support for WVC1 and WMAP
2749 if fourcc not in ('H264', 'AVC1', 'AACL'):
2750 self.report_warning('%s is not a supported codec' % fourcc)
2751 continue
2752 tbr = int(track.attrib['Bitrate']) // 1000
2753 # [1] does not mention Width and Height attributes. However,
2754 # they're often present while MaxWidth and MaxHeight are
2755 # missing, so they should be used as fallbacks
2756 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2757 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2758 sampling_rate = int_or_none(track.get('SamplingRate'))
2759
2760 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2761 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2762
2763 fragments = []
2764 fragment_ctx = {
2765 'time': 0,
2766 }
2767 stream_fragments = stream.findall('c')
2768 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2769 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2770 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2771 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2772 if not fragment_ctx['duration']:
2773 try:
2774 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the fragment list, not the current element
2775 except IndexError:
2776 next_fragment_time = duration
2777 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2778 for _ in range(fragment_repeat):
2779 fragments.append({
2780 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2781 'duration': fragment_ctx['duration'] / stream_timescale,
2782 })
2783 fragment_ctx['time'] += fragment_ctx['duration']
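# Worked example (hypothetical manifest): <c t="0" d="20000000" r="2"/>
# with the default timescale of 10000000 appends two fragments of 2.0
# seconds each (this code treats r as the total fragment count).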
2784
2785 format_id = []
2786 if ism_id:
2787 format_id.append(ism_id)
2788 if stream_name:
2789 format_id.append(stream_name)
2790 format_id.append(compat_str(tbr))
2791
2792 formats.append({
2793 'format_id': '-'.join(format_id),
2794 'url': ism_url,
2795 'manifest_url': ism_url,
2796 'ext': 'ismv' if stream_type == 'video' else 'isma',
2797 'width': width,
2798 'height': height,
2799 'tbr': tbr,
2800 'asr': sampling_rate,
2801 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2802 'acodec': 'none' if stream_type == 'video' else fourcc,
2803 'protocol': 'ism',
2804 'fragments': fragments,
2805 '_download_params': {
2806 'duration': duration,
2807 'timescale': stream_timescale,
2808 'width': width or 0,
2809 'height': height or 0,
2810 'fourcc': fourcc,
2811 'codec_private_data': track.get('CodecPrivateData'),
2812 'sampling_rate': sampling_rate,
2813 'channels': int_or_none(track.get('Channels', 2)),
2814 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2815 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2816 },
2817 })
2818 return formats
2819
2820 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2821 def absolute_url(item_url):
2822 return urljoin(base_url, item_url)
2823
2824 def parse_content_type(content_type):
2825 if not content_type:
2826 return {}
2827 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2828 if ctr:
2829 mimetype, codecs = ctr.groups()
2830 f = parse_codecs(codecs)
2831 f['ext'] = mimetype2ext(mimetype)
2832 return f
2833 return {}
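# For illustration: parse_content_type('video/mp4; codecs="avc1.4d401e, mp4a.40.2"')
# returns {'vcodec': 'avc1.4d401e', 'acodec': 'mp4a.40.2', 'ext': 'mp4'},
# assuming parse_codecs classifies these common codec ids as usual.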
2834
2835 def _media_formats(src, cur_media_type, type_info={}):
2836 full_url = absolute_url(src)
2837 ext = type_info.get('ext') or determine_ext(full_url)
2838 if ext == 'm3u8':
2839 is_plain_url = False
2840 formats = self._extract_m3u8_formats(
2841 full_url, video_id, ext='mp4',
2842 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2843 preference=preference, quality=quality, fatal=False)
2844 elif ext == 'mpd':
2845 is_plain_url = False
2846 formats = self._extract_mpd_formats(
2847 full_url, video_id, mpd_id=mpd_id, fatal=False)
2848 else:
2849 is_plain_url = True
2850 formats = [{
2851 'url': full_url,
2852 'vcodec': 'none' if cur_media_type == 'audio' else None,
2853 }]
2854 return is_plain_url, formats
2855
2856 entries = []
2857 # amp-video and amp-audio are very similar to their HTML5 counterparts
2858 # so we will include them right here (see
2859 # https://www.ampproject.org/docs/reference/components/amp-video)
2860 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2861 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
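# For illustration, this pattern matches plain <video>/<audio> as well as
# <amp-video>, <amp-audio>, <dl8-video> and <dl8-live-video> tags.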
2862 media_tags = [(media_tag, media_tag_name, media_type, '')
2863 for media_tag, media_tag_name, media_type
2864 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2865 media_tags.extend(re.findall(
2866 # We only allow video|audio followed by whitespace or '>'.
2867 # Allowing more characters may result in a significant slowdown (see
2868 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2869 # http://www.porntrex.com/maps/videositemap.xml).
2870 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2871 for media_tag, _, media_type, media_content in media_tags:
2872 media_info = {
2873 'formats': [],
2874 'subtitles': {},
2875 }
2876 media_attributes = extract_attributes(media_tag)
2877 src = strip_or_none(media_attributes.get('src'))
2878 if src:
2879 _, formats = _media_formats(src, media_type)
2880 media_info['formats'].extend(formats)
2881 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2882 if media_content:
2883 for source_tag in re.findall(r'<source[^>]+>', media_content):
2884 s_attr = extract_attributes(source_tag)
2885 # data-video-src and data-src are non-standard but seen
2886 # several times in the wild
2887 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2888 if not src:
2889 continue
2890 f = parse_content_type(s_attr.get('type'))
2891 is_plain_url, formats = _media_formats(src, media_type, f)
2892 if is_plain_url:
2893 # width, height, res, label and title attributes are
2894 # all non-standard but seen several times in the wild
2895 labels = [
2896 s_attr.get(lbl)
2897 for lbl in ('label', 'title')
2898 if str_or_none(s_attr.get(lbl))
2899 ]
2900 width = int_or_none(s_attr.get('width'))
2901 height = (int_or_none(s_attr.get('height'))
2902 or int_or_none(s_attr.get('res')))
2903 if not width or not height:
2904 for lbl in labels:
2905 resolution = parse_resolution(lbl)
2906 if not resolution:
2907 continue
2908 width = width or resolution.get('width')
2909 height = height or resolution.get('height')
2910 for lbl in labels:
2911 tbr = parse_bitrate(lbl)
2912 if tbr:
2913 break
2914 else:
2915 tbr = None
2916 f.update({
2917 'width': width,
2918 'height': height,
2919 'tbr': tbr,
2920 'format_id': s_attr.get('label') or s_attr.get('title'),
2921 })
2922 f.update(formats[0])
2923 media_info['formats'].append(f)
2924 else:
2925 media_info['formats'].extend(formats)
2926 for track_tag in re.findall(r'<track[^>]+>', media_content):
2927 track_attributes = extract_attributes(track_tag)
2928 kind = track_attributes.get('kind')
2929 if not kind or kind in ('subtitles', 'captions'):
2930 src = strip_or_none(track_attributes.get('src'))
2931 if not src:
2932 continue
2933 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2934 media_info['subtitles'].setdefault(lang, []).append({
2935 'url': absolute_url(src),
2936 })
2937 for f in media_info['formats']:
2938 f.setdefault('http_headers', {})['Referer'] = base_url
2939 if media_info['formats'] or media_info['subtitles']:
2940 entries.append(media_info)
2941 return entries
2942
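# Usage sketch (hypothetical page contents): for a page containing
#
#   <video poster="/img/poster.jpg">
#     <source src="/media/clip-720.mp4" type="video/mp4" label="720p">
#     <track kind="subtitles" srclang="en" src="/media/clip.en.vtt">
#   </video>
#
# a call such as
#
#   entries = self._parse_html5_media_entries(url, webpage, video_id)
#
# would yield one entry with a single progressive MP4 format (height 720
# recovered from the label), the absolutized poster as thumbnail and an
# 'en' subtitle track.
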
2943 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2944 signed = 'hdnea=' in manifest_url
2945 if not signed:
2946 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2947 manifest_url = re.sub(
2948 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2949 '', manifest_url).strip('?')
2950
2951 formats = []
2952
2953 hdcore_sign = 'hdcore=3.7.0'
2954 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2955 hds_host = hosts.get('hds')
2956 if hds_host:
2957 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2958 if 'hdcore=' not in f4m_url:
2959 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2960 f4m_formats = self._extract_f4m_formats(
2961 f4m_url, video_id, f4m_id='hds', fatal=False)
2962 for entry in f4m_formats:
2963 entry.update({'extra_param_to_segment_url': hdcore_sign})
2964 formats.extend(f4m_formats)
2965
2966 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2967 hls_host = hosts.get('hls')
2968 if hls_host:
2969 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2970 m3u8_formats = self._extract_m3u8_formats(
2971 m3u8_url, video_id, 'mp4', 'm3u8_native',
2972 m3u8_id='hls', fatal=False)
2973 formats.extend(m3u8_formats)
2974
2975 http_host = hosts.get('http')
2976 if http_host and m3u8_formats and not signed:
2977 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2978 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2979 qualities_length = len(qualities)
2980 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2981 i = 0
2982 for f in m3u8_formats:
2983 if f['vcodec'] != 'none':
2984 for protocol in ('http', 'https'):
2985 http_f = f.copy()
2986 del http_f['manifest_url']
2987 http_url = re.sub(
2988 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2989 http_f.update({
2990 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2991 'url': http_url,
2992 'protocol': protocol,
2993 })
2994 formats.append(http_f)
2995 i += 1
2996
2997 return formats
2998
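# Usage sketch (hypothetical hosts/URL): for an unsigned HLS manifest like
#
#   https://example-vh.akamaihd.net/i/video/clip_,360,720,.mp4.csmil/master.m3u8
#
# the HDS variant is derived from the same path (/i/ -> /z/, master.m3u8 ->
# manifest.f4m), and with hosts={'http': 'example-a.akamaihd.net'} the
# per-quality progressive URLs
#
#   https://example-a.akamaihd.net/video/clip_360.mp4
#   https://example-a.akamaihd.net/video/clip_720.mp4
#
# are synthesized from the corresponding HLS formats.
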
2999 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3000 query = compat_urlparse.urlparse(url).query
3001 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3002 mobj = re.search(
3003 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3004 url_base = mobj.group('url')
3005 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3006 formats = []
3007
3008 def manifest_url(manifest):
3009 m_url = '%s/%s' % (http_base_url, manifest)
3010 if query:
3011 m_url += '?%s' % query
3012 return m_url
3013
3014 if 'm3u8' not in skip_protocols:
3015 formats.extend(self._extract_m3u8_formats(
3016 manifest_url('playlist.m3u8'), video_id, 'mp4',
3017 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3018 if 'f4m' not in skip_protocols:
3019 formats.extend(self._extract_f4m_formats(
3020 manifest_url('manifest.f4m'),
3021 video_id, f4m_id='hds', fatal=False))
3022 if 'dash' not in skip_protocols:
3023 formats.extend(self._extract_mpd_formats(
3024 manifest_url('manifest.mpd'),
3025 video_id, mpd_id='dash', fatal=False))
3026 if re.search(r'(?:/smil:|\.smil)', url_base):
3027 if 'smil' not in skip_protocols:
3028 rtmp_formats = self._extract_smil_formats(
3029 manifest_url('jwplayer.smil'),
3030 video_id, fatal=False)
3031 for rtmp_format in rtmp_formats:
3032 rtsp_format = rtmp_format.copy()
3033 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3034 del rtsp_format['play_path']
3035 del rtsp_format['ext']
3036 rtsp_format.update({
3037 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3038 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3039 'protocol': 'rtsp',
3040 })
3041 formats.extend([rtmp_format, rtsp_format])
3042 else:
3043 for protocol in ('rtmp', 'rtsp'):
3044 if protocol not in skip_protocols:
3045 formats.append({
3046 'url': '%s:%s' % (protocol, url_base),
3047 'format_id': protocol,
3048 'protocol': protocol,
3049 })
3050 return formats
3051
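# Usage sketch (hypothetical Wowza endpoint): the manifest/playlist suffix
# is stripped and each protocol-specific manifest is probed off the same
# base, so
#
#   formats = self._extract_wowza_formats(
#       'https://wowza.example.com/vod/mp4:clip.mp4/playlist.m3u8',
#       video_id, skip_protocols=['dash'])
#
# would try playlist.m3u8 (HLS) and manifest.f4m (HDS) and, since neither
# '/smil:' nor '.smil' appears in the URL, fall back to plain rtmp:// and
# rtsp:// formats.
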
3052 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3053 mobj = re.search(
3054 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3055 webpage)
3056 if mobj:
3057 try:
3058 jwplayer_data = self._parse_json(mobj.group('options'),
3059 video_id=video_id,
3060 transform_source=transform_source)
3061 except ExtractorError:
3062 pass
3063 else:
3064 if isinstance(jwplayer_data, dict):
3065 return jwplayer_data
3066
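# Illustration (hypothetical embed): the regex above targets the classic
# inline setup call, e.g.
#
#   jwplayer("myplayer").setup({"playlist": [{"sources": [...]}]});
#
# The captured options are run through _parse_json with js_to_json, so
# relaxed JavaScript object literals also work; a result that is not a
# dict is discarded.
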
3067 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3068 jwplayer_data = self._find_jwplayer_data(
3069 webpage, video_id, transform_source=js_to_json)
3070 return self._parse_jwplayer_data(
3071 jwplayer_data, video_id, *args, **kwargs)
3072
3073 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3074 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3075 # JWPlayer backward compatibility: flattened playlists
3076 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3077 if 'playlist' not in jwplayer_data:
3078 jwplayer_data = {'playlist': [jwplayer_data]}
3079
3080 entries = []
3081
3082 # JWPlayer backward compatibility: single playlist item
3083 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3084 if not isinstance(jwplayer_data['playlist'], list):
3085 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3086
3087 for video_data in jwplayer_data['playlist']:
3088 # JWPlayer backward compatibility: flattened sources
3089 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3090 if 'sources' not in video_data:
3091 video_data['sources'] = [video_data]
3092
3093 this_video_id = video_id or video_data['mediaid']
3094
3095 formats = self._parse_jwplayer_formats(
3096 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3097 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3098
3099 subtitles = {}
3100 tracks = video_data.get('tracks')
3101 if tracks and isinstance(tracks, list):
3102 for track in tracks:
3103 if not isinstance(track, dict):
3104 continue
3105 track_kind = track.get('kind')
3106 if not track_kind or not isinstance(track_kind, compat_str):
3107 continue
3108 if track_kind.lower() not in ('captions', 'subtitles'):
3109 continue
3110 track_url = urljoin(base_url, track.get('file'))
3111 if not track_url:
3112 continue
3113 subtitles.setdefault(track.get('label') or 'en', []).append({
3114 'url': self._proto_relative_url(track_url)
3115 })
3116
3117 entry = {
3118 'id': this_video_id,
3119 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3120 'description': clean_html(video_data.get('description')),
3121 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3122 'timestamp': int_or_none(video_data.get('pubdate')),
3123 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3124 'subtitles': subtitles,
3125 }
3126 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3127 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3128 entry.update({
3129 '_type': 'url_transparent',
3130 'url': formats[0]['url'],
3131 })
3132 else:
3133 self._sort_formats(formats)
3134 entry['formats'] = formats
3135 entries.append(entry)
3136 if len(entries) == 1:
3137 return entries[0]
3138 else:
3139 return self.playlist_result(entries)
3140
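# Usage sketch (hypothetical configs): legacy layouts are normalized to
# {'playlist': [...]} first, so both of these are accepted:
#
#   self._parse_jwplayer_data(
#       {'mediaid': 'abc123', 'title': 'Clip',
#        'file': 'https://cdn.example.com/clip.mp4'},
#       video_id)
#   self._parse_jwplayer_data(
#       {'playlist': [{'title': 'Clip',
#                      'sources': [{'file': '/videos/clip.m3u8'}]}]},
#       video_id, m3u8_id='hls', base_url=url)
#
# A single playlist item is returned as a plain info dict; several items
# become a playlist result.
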
3141 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3142 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3143 urls = []
3144 formats = []
3145 for source in jwplayer_sources_data:
3146 if not isinstance(source, dict):
3147 continue
3148 source_url = urljoin(
3149 base_url, self._proto_relative_url(source.get('file')))
3150 if not source_url or source_url in urls:
3151 continue
3152 urls.append(source_url)
3153 source_type = source.get('type') or ''
3154 ext = mimetype2ext(source_type) or determine_ext(source_url)
3155 if source_type == 'hls' or ext == 'm3u8':
3156 formats.extend(self._extract_m3u8_formats(
3157 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3158 m3u8_id=m3u8_id, fatal=False))
3159 elif source_type == 'dash' or ext == 'mpd':
3160 formats.extend(self._extract_mpd_formats(
3161 source_url, video_id, mpd_id=mpd_id, fatal=False))
3162 elif ext == 'smil':
3163 formats.extend(self._extract_smil_formats(
3164 source_url, video_id, fatal=False))
3165 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3166 elif source_type.startswith('audio') or ext in (
3167 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3168 formats.append({
3169 'url': source_url,
3170 'vcodec': 'none',
3171 'ext': ext,
3172 })
3173 else:
3174 height = int_or_none(source.get('height'))
3175 if height is None:
3176 # Often no height is provided but there is a label in
3177 # a format like "1080p", "720p SD", or 1080.
3178 height = int_or_none(self._search_regex(
3179 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3180 'height', default=None))
3181 a_format = {
3182 'url': source_url,
3183 'width': int_or_none(source.get('width')),
3184 'height': height,
3185 'tbr': int_or_none(source.get('bitrate')),
3186 'ext': ext,
3187 }
3188 if source_url.startswith('rtmp'):
3189 a_format['ext'] = 'flv'
3190 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3191 # of jwplayer.flash.swf
3192 rtmp_url_parts = re.split(
3193 r'((?:mp4|mp3|flv):)', source_url, 1)
3194 if len(rtmp_url_parts) == 3:
3195 rtmp_url, prefix, play_path = rtmp_url_parts
3196 a_format.update({
3197 'url': rtmp_url,
3198 'play_path': prefix + play_path,
3199 })
3200 if rtmp_params:
3201 a_format.update(rtmp_params)
3202 formats.append(a_format)
3203 return formats
3204
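# Note on the RTMP split above (hypothetical URL): a source like
#
#   rtmp://media.example.com/vod/mp4:videos/clip.mp4
#
# is split at the first 'mp4:'/'mp3:'/'flv:' marker into
# url='rtmp://media.example.com/vod/' plus
# play_path='mp4:videos/clip.mp4', mirroring jwplayer's Flash RTMP
# provider.
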
3205 def _live_title(self, name):
3206 """ Generate the title for a live video """
3207 now = datetime.datetime.now()
3208 now_str = now.strftime('%Y-%m-%d %H:%M')
3209 return name + ' ' + now_str
3210
3211 def _int(self, v, name, fatal=False, **kwargs):
3212 res = int_or_none(v, **kwargs)
3215 if res is None:
3216 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3217 if fatal:
3218 raise ExtractorError(msg)
3219 else:
3220 self.report_warning(msg)
3221 return res
3222
3223 def _float(self, v, name, fatal=False, **kwargs):
3224 res = float_or_none(v, **kwargs)
3225 if res is None:
3226 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3227 if fatal:
3228 raise ExtractorError(msg)
3229 else:
3230 self.report_warning(msg)
3231 return res
3232
3233 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3234 path='/', secure=False, discard=False, rest={}, **kwargs):
3235 cookie = compat_cookiejar_Cookie(
3236 0, name, value, port, port is not None, domain, True,
3237 domain.startswith('.'), path, True, secure, expire_time,
3238 discard, None, None, rest)
3239 self._downloader.cookiejar.set_cookie(cookie)
3240
3241 def _get_cookies(self, url):
3242 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3243 req = sanitized_Request(url)
3244 self._downloader.cookiejar.add_cookie_header(req)
3245 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3246
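# Usage sketch (hypothetical cookie): extractors typically pair these as
#
#   self._set_cookie('.example.com', 'CONSENT', 'YES+')
#   consent = self._get_cookies('https://www.example.com/').get('CONSENT')
#
# where the result is a Morsel (or None). _get_cookies round-trips through
# a fake request so the cookiejar's own domain/path matching decides which
# cookies apply to the URL.
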
3247 def _apply_first_set_cookie_header(self, url_handle, cookie):
3248 """
3249 Apply first Set-Cookie header instead of the last. Experimental.
3250
3251 Some sites (e.g. [1-3]) may serve two cookies under the same name
3252 in the Set-Cookie header and expect the first (old) one to be set
3253 rather than the second (new) one. However, per RFC 6265 it is the
3254 newer cookie that should be set into the cookie store, and that is
3255 what actually happens. We work around this issue by manually
3256 resetting the cookie to the first one.
3257 1. https://new.vk.com/
3258 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3259 3. https://learning.oreilly.com/
3260 """
3261 for header, cookies in url_handle.headers.items():
3262 if header.lower() != 'set-cookie':
3263 continue
3264 if sys.version_info[0] >= 3:
3265 cookies = cookies.encode('iso-8859-1')
3266 cookies = cookies.decode('utf-8')
3267 cookie_value = re.search(
3268 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3269 if cookie_value:
3270 value, domain = cookie_value.groups()
3271 self._set_cookie(domain, cookie, value)
3272 break
3273
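# Usage sketch (hypothetical cookie name): after a request that answered
# with two Set-Cookie headers for 'remixlang', an extractor could call
#
#   urlh = self._request_webpage(url, video_id)
#   self._apply_first_set_cookie_header(urlh, 'remixlang')
#
# to overwrite the stored (second) value with the first one.
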
3274 def get_testcases(self, include_onlymatching=False):
3275 t = getattr(self, '_TEST', None)
3276 if t:
3277 assert not hasattr(self, '_TESTS'), \
3278 '%s has _TEST and _TESTS' % type(self).__name__
3279 tests = [t]
3280 else:
3281 tests = getattr(self, '_TESTS', [])
3282 for t in tests:
3283 if not include_onlymatching and t.get('only_matching', False):
3284 continue
3285 t['name'] = type(self).__name__[:-len('IE')]
3286 yield t
3287
3288 def is_suitable(self, age_limit):
3289 """ Test whether the extractor is generally suitable for the given
3290 age limit (i.e. pornographic sites are not, all others usually are) """
3291
3292 any_restricted = False
3293 for tc in self.get_testcases(include_onlymatching=False):
3294 if tc.get('playlist', []):
3295 tc = tc['playlist'][0]
3296 is_restricted = age_restricted(
3297 tc.get('info_dict', {}).get('age_limit'), age_limit)
3298 if not is_restricted:
3299 return True
3300 any_restricted = any_restricted or is_restricted
3301 return not any_restricted
3302
3303 def extract_subtitles(self, *args, **kwargs):
3304 if (self._downloader.params.get('writesubtitles', False)
3305 or self._downloader.params.get('listsubtitles')):
3306 return self._get_subtitles(*args, **kwargs)
3307 return {}
3308
3309 def _get_subtitles(self, *args, **kwargs):
3310 raise NotImplementedError('This method must be implemented by subclasses')
3311
3312 @staticmethod
3313 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3314 """ Merge subtitle items for one language. Items with duplicated URLs
3315 will be dropped. """
3316 list1_urls = {item['url'] for item in subtitle_list1}
3317 ret = list(subtitle_list1)
3318 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3319 return ret
3320
3321 @classmethod
3322 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3323 """ Merge two subtitle dictionaries, language by language. """
3324 ret = dict(subtitle_dict1)
3325 for lang in subtitle_dict2:
3326 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3327 return ret
3328
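# Worked example (hypothetical data): merging
#
#   {'en': [{'url': 'https://example.com/en.vtt'}]}
#
# with
#
#   {'en': [{'url': 'https://example.com/en.vtt'}],
#    'de': [{'url': 'https://example.com/de.vtt'}]}
#
# keeps a single 'en' item (the duplicate URL is dropped) and adds 'de'.
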
3329 def extract_automatic_captions(self, *args, **kwargs):
3330 if (self._downloader.params.get('writeautomaticsub', False)
3331 or self._downloader.params.get('listsubtitles')):
3332 return self._get_automatic_captions(*args, **kwargs)
3333 return {}
3334
3335 def _get_automatic_captions(self, *args, **kwargs):
3336 raise NotImplementedError('This method must be implemented by subclasses')
3337
3338 def mark_watched(self, *args, **kwargs):
3339 if (self._downloader.params.get('mark_watched', False)
3340 and (self._get_login_info()[0] is not None
3341 or self._downloader.params.get('cookiefile') is not None)):
3342 self._mark_watched(*args, **kwargs)
3343
3344 def _mark_watched(self, *args, **kwargs):
3345 raise NotImplementedError('This method must be implemented by subclasses')
3346
3347 def geo_verification_headers(self):
3348 headers = {}
3349 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3350 if geo_verification_proxy:
3351 headers['Ytdl-request-proxy'] = geo_verification_proxy
3352 return headers
3353
3354 def _generic_id(self, url):
3355 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3356
3357 def _generic_title(self, url):
3358 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3359
3360 @staticmethod
3361 def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3362 all_known = all(map(
3363 lambda x: x is not None,
3364 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3365 return (
3366 'private' if is_private
3367 else 'premium_only' if needs_premium
3368 else 'subscriber_only' if needs_subscription
3369 else 'needs_auth' if needs_auth
3370 else 'unlisted' if is_unlisted
3371 else 'public' if all_known
3372 else None)
3373
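# Worked examples: the first truthy flag wins in the order listed above,
# and 'public' is only reported when every flag is known to be False:
#
#   self._availability(False, False, False, False, False)  # -> 'public'
#   self._availability(False, True, None, None, None)      # -> 'premium_only'
#   self._availability(False, False, False, False, None)   # -> None (unknown)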
3374
3375 class SearchInfoExtractor(InfoExtractor):
3376 """
3377 Base class for paged search query extractors.
3378 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3379 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3380 """
3381
3382 @classmethod
3383 def _make_valid_url(cls):
3384 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3385
3386 @classmethod
3387 def suitable(cls, url):
3388 return re.match(cls._make_valid_url(), url) is not None
3389
3390 def _real_extract(self, query):
3391 mobj = re.match(self._make_valid_url(), query)
3392 if mobj is None:
3393 raise ExtractorError('Invalid search query "%s"' % query)
3394
3395 prefix = mobj.group('prefix')
3396 query = mobj.group('query')
3397 if prefix == '':
3398 return self._get_n_results(query, 1)
3399 elif prefix == 'all':
3400 return self._get_n_results(query, self._MAX_RESULTS)
3401 else:
3402 n = int(prefix)
3403 if n <= 0:
3404 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3405 elif n > self._MAX_RESULTS:
3406 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3407 n = self._MAX_RESULTS
3408 return self._get_n_results(query, n)
3409
3410 def _get_n_results(self, query, n):
3411 """Get a specified number of results for a query"""
3412 raise NotImplementedError('This method must be implemented by subclasses')
3413
3414 @property
3415 def SEARCH_KEY(self):
3416 return self._SEARCH_KEY
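
# A minimal concrete subclass might look like this (hypothetical site;
# besides the two class attributes only _get_n_results is mandatory):
#
#   class ExampleSearchIE(SearchInfoExtractor):
#       _SEARCH_KEY = 'examplesearch'
#       _MAX_RESULTS = 50
#
#       def _get_n_results(self, query, n):
#           entries = self._fetch_search_results(query, n)  # hypothetical helper
#           return self.playlist_result(entries, query)
#
# With this in place, 'examplesearch5:kittens' fetches the first 5 results
# and 'examplesearchall:kittens' up to _MAX_RESULTS.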