# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import math
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_integer_types,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary, which is then
    passed to the YoutubeDL instance. YoutubeDL then processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference
                                 Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference
                                 Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers
                                 A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio
                                 If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options
                                 A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference; each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist').

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
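
    For example, a minimal single-video result might look like this (all
    values hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/media/4234987.mp4',
        }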


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
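
    As a sketch (identifiers hypothetical), a "url_transparent" result that
    delegates extraction while overriding the title might be:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example/embed/abc123',
            'ie_key': 'SomeVideoHost',
            'title': 'Title taken from the embedding page',
        }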


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

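    # Note: _VALID_URL is expected to define a named "id" group that
    # _match_id extracts above; a hypothetical example:
    #   _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
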
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is called for the initial geo bypass setup during instance
        initialization with _GEO_COUNTRIES and _GEO_IP_BLOCKS.

        You may also call it manually from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or for some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

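        A minimal sketch of a manual call (country codes hypothetical):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })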
469 """
470 if not self._x_forwarded_for_ip:
471
472 # Geo bypass mechanism is explicitly disabled by user
473 if not self._downloader.params.get('geo_bypass', True):
474 return
475
476 if not geo_bypass_context:
477 geo_bypass_context = {}
478
479 # Backward compatibility: previously _initialize_geo_bypass
480 # expected a list of countries, some 3rd party code may still use
481 # it this way
482 if isinstance(geo_bypass_context, (list, tuple)):
483 geo_bypass_context = {
484 'countries': geo_bypass_context,
485 }
486
487 # The whole point of geo bypass mechanism is to fake IP
488 # as X-Forwarded-For HTTP header based on some IP block or
489 # country code.
490
491 # Path 1: bypassing based on IP block in CIDR notation
492
493 # Explicit IP block specified by user, use it right away
494 # regardless of whether extractor is geo bypassable or not
495 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
496
497 # Otherwise use random IP block from geo bypass context but only
498 # if extractor is known as geo bypassable
499 if not ip_block:
500 ip_blocks = geo_bypass_context.get('ip_blocks')
501 if self._GEO_BYPASS and ip_blocks:
502 ip_block = random.choice(ip_blocks)
503
504 if ip_block:
505 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
506 if self._downloader.params.get('verbose', False):
507 self._downloader.to_screen(
508 '[debug] Using fake IP %s as X-Forwarded-For.'
509 % self._x_forwarded_for_ip)
510 return
511
512 # Path 2: bypassing based on country code
513
514 # Explicit country code specified by user, use it right away
515 # regardless of whether extractor is geo bypassable or not
516 country = self._downloader.params.get('geo_bypass_country', None)
517
518 # Otherwise use random country code from geo bypass context but
519 # only if extractor is known as geo bypassable
520 if not country:
521 countries = geo_bypass_context.get('countries')
522 if self._GEO_BYPASS and countries:
523 country = random.choice(countries)
524
525 if country:
526 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
527 if self._downloader.params.get('verbose', False):
528 self._downloader.to_screen(
529 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
530 % (self._x_forwarded_for_ip, country.upper()))
531
    def extract(self, url):
        """Extracts URL information and returns it as a dict."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal, data=data,
            headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(
            urlh, url_or_request, video_id, note, errnote, fatal,
            encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
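
        For example (URL hypothetical), to also accept a 404 response:

            webpage = self._download_webpage(
                'https://example.com/video/123', video_id,
                expected_status=404)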
804 """
805
806 success = False
807 try_count = 0
808 while success is False:
809 try:
810 res = self._download_webpage_handle(
811 url_or_request, video_id, note, errnote, fatal,
812 encoding=encoding, data=data, headers=headers, query=query,
813 expected_status=expected_status)
814 success = True
815 except compat_http_client.IncompleteRead as e:
816 try_count += 1
817 if try_count >= tries:
818 raise e
819 self._sleep(timeout, video_id)
820 if res is False:
821 return res
822 else:
823 content, _ = res
824 return content
825
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
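
        A typical call might look like this (URL and query hypothetical):

            data = self._download_json(
                'https://example.com/api/video/%s' % video_id, video_id,
                query={'format': 'json'})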
906 """
907 res = self._download_json_handle(
908 url_or_request, video_id, note=note, errnote=errnote,
909 transform_source=transform_source, fatal=fatal, encoding=encoding,
910 data=data, headers=headers, query=query,
911 expected_status=expected_status)
912 return res if res is False else res[0]
913
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns, returning the first matching group.
        In case of failure return a default value, or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
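
        For example (pattern and page content hypothetical):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)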
1002 """
1003 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1004 mobj = re.search(pattern, string, flags)
1005 else:
1006 for p in pattern:
1007 mobj = re.search(p, string, flags)
1008 if mobj:
1009 break
1010
1011 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1012 _name = '\033[0;34m%s\033[0m' % name
1013 else:
1014 _name = name
1015
1016 if mobj:
1017 if group is None:
1018 # return the first matching group
1019 return next(g for g in mobj.groups() if g is not None)
1020 else:
1021 return mobj.group(group)
1022 elif default is not NO_DEFAULT:
1023 return default
1024 elif fatal:
1025 raise RegexNotFoundError('Unable to extract %s' % _name)
1026 else:
1027 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1028 return None
1029
1030 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1031 """
1032 Like _search_regex, but strips HTML tags and unescapes entities.
1033 """
1034 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1035 if res:
1036 return clean_html(res).strip()
1037 else:
1038 return res
1039
    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
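
        A matching ~/.netrc entry would look like this (machine name
        hypothetical):

            machine examplesite login myusername password mypassword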
1068 """
1069 if self._downloader is None:
1070 return (None, None)
1071
1072 downloader_params = self._downloader.params
1073
1074 # Attempt to use provided username and password or .netrc data
1075 if downloader_params.get(username_option) is not None:
1076 username = downloader_params[username_option]
1077 password = downloader_params[password_option]
1078 else:
1079 username, password = self._get_netrc_login_info(netrc_machine)
1080
1081 return username, password
1082
1083 def _get_tfa_info(self, note='two-factor verification code'):
1084 """
1085 Get the two-factor authentication info
1086 TODO - asking the user will be required for sms/phone verify
1087 currently just uses the command line option
1088 If there's no info available, return None
1089 """
1090 if self._downloader is None:
1091 return None
1092 downloader_params = self._downloader.params
1093
1094 if downloader_params.get('twofactor') is not None:
1095 return downloader_params['twofactor']
1096
1097 return compat_getpass('Type %s and press [Return]: ' % note)
1098
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }
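
        # As a sketch, the mapping above targets JSON-LD shaped like
        # (values hypothetical):
        #   {"@type": "InteractionCounter",
        #    "interactionType": {"@type": "WatchAction"},
        #    "userInteractionCount": "1,234"}
        # which extract_interaction_statistic() below turns into
        # info['view_count'] = 1234.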

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For the interaction count some sites provide a string instead
                # of an integer (as per the spec) with non-digit characters
                # (e.g. ","), so extract the count with the more relaxed
                # str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'uploader': str_or_none(e.get('author')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
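
        # The regex above parses a single sort token; hypothetical examples:
        #   'res'       -> field='res'
        #   '+br'       -> field='br', reverse='+'
        #   'res:1080'  -> field='res', seperator=':', limit='1080'
        #   'fps~30'    -> field='fps', seperator='~' (closest match), limit='30'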

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'type': 'extractor'},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }
1452
1453 _order = []
1454
1455 def _get_field_setting(self, field, key):
1456 if field not in self.settings:
1457 self.settings[field] = {}
1458 propObj = self.settings[field]
1459 if key not in propObj:
1460 type = propObj.get('type')
1461 if key == 'field':
1462 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1463 elif key == 'convert':
1464 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1465 else:
1466 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1467 propObj[key] = default
1468 return propObj[key]
1469
1470 def _resolve_field_value(self, field, value, convertNone=False):
1471 if value is None:
1472 if not convertNone:
1473 return None
1474 else:
1475 value = value.lower()
1476 conversion = self._get_field_setting(field, 'convert')
1477 if conversion == 'ignore':
1478 return None
1479 if conversion == 'string':
1480 return value
1481 elif conversion == 'float_none':
1482 return float_or_none(value)
1483 elif conversion == 'bytes':
1484 return FileDownloader.parse_bytes(value)
1485 elif conversion == 'order':
1486 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1487 use_regex = self._get_field_setting(field, 'regex')
1488 list_length = len(order_list)
1489 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1490 if use_regex and value is not None:
1491 for i, regex in enumerate(order_list):
1492 if regex and re.match(regex, value):
1493 return list_length - i
1494 return list_length - empty_pos # not in list
1495 else: # not regex or value is None
1496 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1497 else:
1498 if value.isnumeric():
1499 return float(value)
1500 else:
1501 self.settings[field]['convert'] = 'string'
1502 return value
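# NOTE (editor's illustration, assumed values): for the 'proto' field,
# whose order list starts with ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', ...],
# a value of 'https' matches the regex at index 0 and resolves to
# list_length - 0 (the best rank), while an unrecognized protocol falls
# back to the rank of the '' placeholder.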
1503
1504 def evaluate_params(self, params, sort_extractor):
1505 self._use_free_order = params.get('prefer_free_formats', False)
1506 self._sort_user = params.get('format_sort', [])
1507 self._sort_extractor = sort_extractor
1508
1509 def add_item(field, reverse, closest, limit_text):
1510 field = field.lower()
1511 if field in self._order:
1512 return
1513 self._order.append(field)
1514 limit = self._resolve_field_value(field, limit_text)
1515 data = {
1516 'reverse': reverse,
1517 'closest': False if limit is None else closest,
1518 'limit_text': limit_text,
1519 'limit': limit}
1520 if field in self.settings:
1521 self.settings[field].update(data)
1522 else:
1523 self.settings[field] = data
1524
1525 sort_list = (
1526 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1527 + (tuple() if params.get('format_sort_force', False)
1528 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1529 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
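# NOTE (editor): i.e. 'forced' fields always come first; 'priority'
# fields come next unless format_sort_force is set; then the user sort,
# the extractor sort and finally the remaining defaults.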
1530
1531 for item in sort_list:
1532 match = re.match(self.regex, item)
1533 if match is None:
1534 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1535 field = match.group('field')
1536 if field is None:
1537 continue
1538 if self._get_field_setting(field, 'type') == 'alias':
1539 field = self._get_field_setting(field, 'field')
1540 reverse = match.group('reverse') is not None
1541 closest = match.group('seperator') == '~'
1542 limit_text = match.group('limit')
1543
1544 has_limit = limit_text is not None
1545 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1546 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1547
1548 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1549 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1550 limit_count = len(limits)
1551 for (i, f) in enumerate(fields):
1552 add_item(f, reverse, closest,
1553 limits[i] if i < limit_count
1554 else limits[0] if has_limit and not has_multiple_limits
1555 else None)
1556
1557 def print_verbose_info(self, to_screen):
1558 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1559 if self._sort_extractor:
1560 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1561 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1562 '+' if self._get_field_setting(field, 'reverse') else '', field,
1563 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1564 self._get_field_setting(field, 'limit_text'),
1565 self._get_field_setting(field, 'limit'))
1566 if self._get_field_setting(field, 'limit_text') is not None else '')
1567 for field in self._order if self._get_field_setting(field, 'visible')]))
1568
1569 def _calculate_field_preference_from_value(self, format, field, type, value):
1570 reverse = self._get_field_setting(field, 'reverse')
1571 closest = self._get_field_setting(field, 'closest')
1572 limit = self._get_field_setting(field, 'limit')
1573
1574 if type == 'extractor':
1575 maximum = self._get_field_setting(field, 'max')
1576 if value is None or (maximum is not None and value >= maximum):
1577 value = -1
1578 elif type == 'boolean':
1579 in_list = self._get_field_setting(field, 'in_list')
1580 not_in_list = self._get_field_setting(field, 'not_in_list')
1581 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1582 elif type == 'ordered':
1583 value = self._resolve_field_value(field, value, True)
1584
1585 # try to convert to number
1586 val_num = float_or_none(value)
1587 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1588 if is_num:
1589 value = val_num
1590
1591 return ((-10, 0) if value is None
1592 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1593 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1594 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1595 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1596 else (-1, value, 0))
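# NOTE (editor's illustration, assumed values): these tuples are compared
# lexicographically, so with limit=720, closest=True and reverse=False a
# height of 700 yields (0, -20, 20) and a height of 800 yields
# (0, -80, -80); 700 ranks higher because it is nearer to the limit.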
1597
1598 def _calculate_field_preference(self, format, field):
1599 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1600 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1601 if type == 'multiple':
1602 type = 'field' # Only 'field' is allowed in multiple for now
1603 actual_fields = self._get_field_setting(field, 'field')
1604
1605 def wrapped_function(values):
1606 values = tuple(filter(lambda x: x is not None, values))
1607 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1608 else values[0] if values
1609 else None)
1610
1611 value = wrapped_function((get_value(f) for f in actual_fields))
1612 else:
1613 value = get_value(field)
1614 return self._calculate_field_preference_from_value(format, field, type, value)
1615
1616 def calculate_preference(self, format):
1617 # Determine missing protocol
1618 if not format.get('protocol'):
1619 format['protocol'] = determine_protocol(format)
1620
1621 # Determine missing ext
1622 if not format.get('ext') and 'url' in format:
1623 format['ext'] = determine_ext(format['url'])
1624 if format.get('vcodec') == 'none':
1625 format['audio_ext'] = format['ext']
1626 format['video_ext'] = 'none'
1627 else:
1628 format['video_ext'] = format['ext']
1629 format['audio_ext'] = 'none'
1630 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1631 # format['preference'] = -1000
1632
1633 # Determine missing bitrates
1634 if format.get('tbr') is None:
1635 if format.get('vbr') is not None and format.get('abr') is not None:
1636 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1637 else:
1638 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1639 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1640 if format.get('acodec') != 'none' and format.get('abr') is None:
1641 format['abr'] = format.get('tbr') - format.get('vbr', 0)
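# NOTE (editor's illustration, assumed values): e.g. a format with
# tbr=1128 and abr=128 but no vbr is assigned vbr = 1128 - 128 = 1000.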
1642
1643 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1644
1645 def _sort_formats(self, formats, field_preference=[]):
1646 if not formats:
1647 raise ExtractorError('No video formats found')
1648 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1649 format_sort.evaluate_params(self._downloader.params, field_preference)
1650 if self._downloader.params.get('verbose', False):
1651 format_sort.print_verbose_info(self._downloader.to_screen)
1652 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1653
1654 def _check_formats(self, formats, video_id):
1655 if formats:
1656 formats[:] = filter(
1657 lambda f: self._is_valid_url(
1658 f['url'], video_id,
1659 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1660 formats)
1661
1662 @staticmethod
1663 def _remove_duplicate_formats(formats):
1664 format_urls = set()
1665 unique_formats = []
1666 for f in formats:
1667 if f['url'] not in format_urls:
1668 format_urls.add(f['url'])
1669 unique_formats.append(f)
1670 formats[:] = unique_formats
1671
1672 def _is_valid_url(self, url, video_id, item='video', headers={}):
1673 url = self._proto_relative_url(url, scheme='http:')
1674 # For now assume non HTTP(S) URLs always valid
1675 if not (url.startswith('http://') or url.startswith('https://')):
1676 return True
1677 try:
1678 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1679 return True
1680 except ExtractorError as e:
1681 self.to_screen(
1682 '%s: %s URL is invalid, skipping: %s'
1683 % (video_id, item, error_to_compat_str(e.cause)))
1684 return False
1685
1686 def http_scheme(self):
1687 """ Either "http:" or "https:", depending on the user's preferences """
1688 return (
1689 'http:'
1690 if self._downloader.params.get('prefer_insecure', False)
1691 else 'https:')
1692
1693 def _proto_relative_url(self, url, scheme=None):
1694 if url is None:
1695 return url
1696 if url.startswith('//'):
1697 if scheme is None:
1698 scheme = self.http_scheme()
1699 return scheme + url
1700 else:
1701 return url
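# NOTE (editor's illustration, hypothetical URL): '//cdn.example.com/v.mp4'
# becomes 'https://cdn.example.com/v.mp4' unless prefer_insecure is set.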
1702
1703 def _sleep(self, timeout, video_id, msg_template=None):
1704 if msg_template is None:
1705 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1706 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1707 self.to_screen(msg)
1708 time.sleep(timeout)
1709
1710 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1711 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1712 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1713 manifest = self._download_xml(
1714 manifest_url, video_id, 'Downloading f4m manifest',
1715 'Unable to download f4m manifest',
1716 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1717 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1718 transform_source=transform_source,
1719 fatal=fatal, data=data, headers=headers, query=query)
1720
1721 if manifest is False:
1722 return []
1723
1724 return self._parse_f4m_formats(
1725 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1726 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1727
1728 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1729 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1730 fatal=True, m3u8_id=None):
1731 if not isinstance(manifest, compat_etree_Element) and not fatal:
1732 return []
1733
1734 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1735 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1736 if akamai_pv is not None and ';' in akamai_pv.text:
1737 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1738 if playerVerificationChallenge.strip() != '':
1739 return []
1740
1741 formats = []
1742 manifest_version = '1.0'
1743 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1744 if not media_nodes:
1745 manifest_version = '2.0'
1746 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1747 # Remove unsupported DRM-protected media renditions from the final
1748 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1749 media_nodes = remove_encrypted_media(media_nodes)
1750 if not media_nodes:
1751 return formats
1752
1753 manifest_base_url = get_base_url(manifest)
1754
1755 bootstrap_info = xpath_element(
1756 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1757 'bootstrap info', default=None)
1758
1759 vcodec = None
1760 mime_type = xpath_text(
1761 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1762 'mime type', default=None)
1763 if mime_type and mime_type.startswith('audio/'):
1764 vcodec = 'none'
1765
1766 for i, media_el in enumerate(media_nodes):
1767 tbr = int_or_none(media_el.attrib.get('bitrate'))
1768 width = int_or_none(media_el.attrib.get('width'))
1769 height = int_or_none(media_el.attrib.get('height'))
1770 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1771 # If <bootstrapInfo> is present, the specified f4m is a
1772 # stream-level manifest, and only set-level manifests may refer to
1773 # external resources. See section 11.4 and section 4 of F4M spec
1774 if bootstrap_info is None:
1775 media_url = None
1776 # @href is introduced in 2.0, see section 11.6 of F4M spec
1777 if manifest_version == '2.0':
1778 media_url = media_el.attrib.get('href')
1779 if media_url is None:
1780 media_url = media_el.attrib.get('url')
1781 if not media_url:
1782 continue
1783 manifest_url = (
1784 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1785 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1786 # If media_url is itself a f4m manifest do the recursive extraction
1787 # since bitrates in parent manifest (this one) and media_url manifest
1788 # may differ leading to inability to resolve the format by requested
1789 # bitrate in f4m downloader
1790 ext = determine_ext(manifest_url)
1791 if ext == 'f4m':
1792 f4m_formats = self._extract_f4m_formats(
1793 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1794 transform_source=transform_source, fatal=fatal)
1795 # Sometimes a stream-level manifest contains a single media entry that
1796 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1797 # At the same time the parent's media entry in the set-level manifest may
1798 # contain it. We will copy it from the parent in such cases.
1799 if len(f4m_formats) == 1:
1800 f = f4m_formats[0]
1801 f.update({
1802 'tbr': f.get('tbr') or tbr,
1803 'width': f.get('width') or width,
1804 'height': f.get('height') or height,
1805 'format_id': f.get('format_id') if not tbr else format_id,
1806 'vcodec': vcodec,
1807 })
1808 formats.extend(f4m_formats)
1809 continue
1810 elif ext == 'm3u8':
1811 formats.extend(self._extract_m3u8_formats(
1812 manifest_url, video_id, 'mp4', preference=preference,
1813 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1814 continue
1815 formats.append({
1816 'format_id': format_id,
1817 'url': manifest_url,
1818 'manifest_url': manifest_url,
1819 'ext': 'flv' if bootstrap_info is not None else None,
1820 'protocol': 'f4m',
1821 'tbr': tbr,
1822 'width': width,
1823 'height': height,
1824 'vcodec': vcodec,
1825 'preference': preference,
1826 'quality': quality,
1827 })
1828 return formats
1829
1830 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1831 return {
1832 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1833 'url': m3u8_url,
1834 'ext': ext,
1835 'protocol': 'm3u8',
1836 'preference': preference - 100 if preference else -100,
1837 'quality': quality,
1838 'resolution': 'multiple',
1839 'format_note': 'Quality selection URL',
1840 }
1841
1842 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1843 entry_protocol='m3u8', preference=None, quality=None,
1844 m3u8_id=None, live=False, note=None, errnote=None,
1845 fatal=True, data=None, headers={}, query={}):
1846 res = self._download_webpage_handle(
1847 m3u8_url, video_id,
1848 note=note or 'Downloading m3u8 information',
1849 errnote=errnote or 'Failed to download m3u8 information',
1850 fatal=fatal, data=data, headers=headers, query=query)
1851
1852 if res is False:
1853 return []
1854
1855 m3u8_doc, urlh = res
1856 m3u8_url = urlh.geturl()
1857
1858 return self._parse_m3u8_formats(
1859 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1860 preference=preference, quality=quality, m3u8_id=m3u8_id,
1861 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1862 headers=headers, query=query, video_id=video_id)
1863
1864 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1865 entry_protocol='m3u8', preference=None, quality=None,
1866 m3u8_id=None, live=False, note=None, errnote=None,
1867 fatal=True, data=None, headers={}, query={}, video_id=None):
1868 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1869 return []
1870
1871 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1872 return []
1873
1874 formats = []
1875
1876 format_url = lambda u: (
1877 u
1878 if re.match(r'^https?://', u)
1879 else compat_urlparse.urljoin(m3u8_url, u))
1880
1881 split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
1882
1883 # References:
1884 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1885 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1886 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1887
1888 # We should try extracting formats only from master playlists [1, 4.3.4],
1889 # i.e. playlists that describe available qualities. On the other hand
1890 # media playlists [1, 4.3.3] should be returned as is since they contain
1891 # just the media without quality renditions.
1892 # Fortunately, a master playlist can be easily distinguished from a media
1893 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
1894 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1895 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
1896 # media playlist and MUST NOT appear in a master playlist, thus we can
1897 # clearly detect a media playlist with this criterion.
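# NOTE (editor's illustration, hypothetical playlists): a master playlist
# contains entries such as
#   #EXT-X-STREAM-INF:BANDWIDTH=800000,RESOLUTION=1280x720
#   https://example.com/720p.m3u8
# whereas a media playlist carries the REQUIRED target duration and the
# actual segments:
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.9,
#   https://example.com/segment0.ts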
1898
1899 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
1900 if not m3u8_doc:
1901 if not format_url:
1902 return []
1903 res = self._download_webpage_handle(
1904 format_url, video_id,
1905 note=False,
1906 errnote=errnote or 'Failed to download m3u8 playlist information',
1907 fatal=fatal, data=data, headers=headers, query=query)
1908
1909 if res is False:
1910 return []
1911
1912 m3u8_doc, urlh = res
1913 format_url = urlh.geturl()
1914
1915 playlist_formats = []
1916 i = (
1917 0
1918 if split_discontinuity
1919 else None)
1920 format_info = {
1921 'index': i,
1922 'key_data': None,
1923 'files': [],
1924 }
1925 for line in m3u8_doc.splitlines():
1926 if not line.startswith('#'):
1927 format_info['files'].append(line)
1928 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
1929 i += 1
1930 playlist_formats.append(format_info)
1931 format_info = {
1932 'index': i,
1933 'url': format_url,
1934 'files': [],
1935 }
1936 playlist_formats.append(format_info)
1937 return playlist_formats
1938
1939 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1940
1941 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
1942
1943 for format in playlist_formats:
1944 format_id = []
1945 if m3u8_id:
1946 format_id.append(m3u8_id)
1947 format_index = format.get('index')
1948 if format_index:
1949 format_id.append(str(format_index))
1950 f = {
1951 'format_id': '-'.join(format_id),
1952 'format_index': format_index,
1953 'url': m3u8_url,
1954 'ext': ext,
1955 'protocol': entry_protocol,
1956 'preference': preference,
1957 'quality': quality,
1958 }
1959 formats.append(f)
1960
1961 return formats
1962
1963 groups = {}
1964 last_stream_inf = {}
1965
1966 def extract_media(x_media_line):
1967 media = parse_m3u8_attributes(x_media_line)
1968 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1969 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1970 if not (media_type and group_id and name):
1971 return
1972 groups.setdefault(group_id, []).append(media)
1973 if media_type not in ('VIDEO', 'AUDIO'):
1974 return
1975 media_url = media.get('URI')
1976 if media_url:
1977 manifest_url = format_url(media_url)
1978 format_id = []
1979 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
1980
1981 for format in playlist_formats:
1982 format_index = format.get('index')
1983 for v in (m3u8_id, group_id, name):
1984 if v:
1985 format_id.append(v)
1986 if format_index:
1987 format_id.append(str(format_index))
1988 f = {
1989 'format_id': '-'.join(format_id),
1990 'format_index': format_index,
1991 'url': manifest_url,
1992 'manifest_url': m3u8_url,
1993 'language': media.get('LANGUAGE'),
1994 'ext': ext,
1995 'protocol': entry_protocol,
1996 'preference': preference,
1997 'quality': quality,
1998 }
1999 if media_type == 'AUDIO':
2000 f['vcodec'] = 'none'
2001 formats.append(f)
2002
2003 def build_stream_name():
2004 # Although the specification does not mention the NAME attribute for
2005 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2006 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2007 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2008 stream_name = last_stream_inf.get('NAME')
2009 if stream_name:
2010 return stream_name
2011 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2012 # from the corresponding rendition group
2013 stream_group_id = last_stream_inf.get('VIDEO')
2014 if not stream_group_id:
2015 return
2016 stream_group = groups.get(stream_group_id)
2017 if not stream_group:
2018 return stream_group_id
2019 rendition = stream_group[0]
2020 return rendition.get('NAME') or stream_group_id
2021
2022 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2023 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2024 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2025 for line in m3u8_doc.splitlines():
2026 if line.startswith('#EXT-X-MEDIA:'):
2027 extract_media(line)
2028
2029 for line in m3u8_doc.splitlines():
2030 if line.startswith('#EXT-X-STREAM-INF:'):
2031 last_stream_inf = parse_m3u8_attributes(line)
2032 elif line.startswith('#') or not line.strip():
2033 continue
2034 else:
2035 tbr = float_or_none(
2036 last_stream_inf.get('AVERAGE-BANDWIDTH')
2037 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2038 manifest_url = format_url(line.strip())
2039
2040 playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
2041
2042 for format in playlist_formats:
2043 format_id = []
2044 if m3u8_id:
2045 format_id.append(m3u8_id)
2046 format_index = format.get('index')
2047 stream_name = build_stream_name()
2048 # Bandwidth of live streams may differ over time thus making
2049 # format_id unpredictable. So it's better to keep provided
2050 # format_id intact.
2051 if not live:
2052 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2053 if format_index:
2054 format_id.append(str(format_index))
2055 f = {
2056 'format_id': '-'.join(format_id),
2057 'format_index': format_index,
2058 'url': manifest_url,
2059 'manifest_url': m3u8_url,
2060 'tbr': tbr,
2061 'ext': ext,
2062 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2063 'protocol': entry_protocol,
2064 'preference': preference,
2065 'quality': quality,
2066 }
2067 resolution = last_stream_inf.get('RESOLUTION')
2068 if resolution:
2069 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2070 if mobj:
2071 f['width'] = int(mobj.group('width'))
2072 f['height'] = int(mobj.group('height'))
2073 # Unified Streaming Platform
2074 mobj = re.search(
2075 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2076 if mobj:
2077 abr, vbr = mobj.groups()
2078 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2079 f.update({
2080 'vbr': vbr,
2081 'abr': abr,
2082 })
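# NOTE (editor's illustration, hypothetical URL): a USP URL such as
# '.../video.ism/video.m3u8?audio=128000-video=800000' yields
# abr=128 and vbr=800 after scaling by 1000.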
2083 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2084 f.update(codecs)
2085 audio_group_id = last_stream_inf.get('AUDIO')
2086 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2087 # references a rendition group MUST have a CODECS attribute.
2088 # However, this is not always respected, for example, [2]
2089 # contains EXT-X-STREAM-INF tag which references AUDIO
2090 # rendition group but does not have CODECS and despite
2091 # referencing an audio group it represents a complete
2092 # (with audio and video) format. So, for such cases we will
2093 # ignore references to rendition groups and treat them
2094 # as complete formats.
2095 if audio_group_id and codecs and f.get('vcodec') != 'none':
2096 audio_group = groups.get(audio_group_id)
2097 if audio_group and audio_group[0].get('URI'):
2098 # TODO: update acodec for audio only formats with
2099 # the same GROUP-ID
2100 f['acodec'] = 'none'
2101 formats.append(f)
2102
2103 # for DailyMotion
2104 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2105 if progressive_uri:
2106 http_f = f.copy()
2107 del http_f['manifest_url']
2108 http_f.update({
2109 'format_id': f['format_id'].replace('hls-', 'http-'),
2110 'protocol': 'http',
2111 'url': progressive_uri,
2112 })
2113 formats.append(http_f)
2114
2115 last_stream_inf = {}
2116 return formats
2117
2118 @staticmethod
2119 def _xpath_ns(path, namespace=None):
2120 if not namespace:
2121 return path
2122 out = []
2123 for c in path.split('/'):
2124 if not c or c == '.':
2125 out.append(c)
2126 else:
2127 out.append('{%s}%s' % (namespace, c))
2128 return '/'.join(out)
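# NOTE (editor's illustration, assumed namespace):
# _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL') returns
# './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'.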
2129
2130 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2131 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2132
2133 if smil is False:
2134 assert not fatal
2135 return []
2136
2137 namespace = self._parse_smil_namespace(smil)
2138
2139 return self._parse_smil_formats(
2140 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2141
2142 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2143 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2144 if smil is False:
2145 return {}
2146 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2147
2148 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2149 return self._download_xml(
2150 smil_url, video_id, 'Downloading SMIL file',
2151 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2152
2153 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2154 namespace = self._parse_smil_namespace(smil)
2155
2156 formats = self._parse_smil_formats(
2157 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2158 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2159
2160 video_id = os.path.splitext(url_basename(smil_url))[0]
2161 title = None
2162 description = None
2163 upload_date = None
2164 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2165 name = meta.attrib.get('name')
2166 content = meta.attrib.get('content')
2167 if not name or not content:
2168 continue
2169 if not title and name == 'title':
2170 title = content
2171 elif not description and name in ('description', 'abstract'):
2172 description = content
2173 elif not upload_date and name == 'date':
2174 upload_date = unified_strdate(content)
2175
2176 thumbnails = [{
2177 'id': image.get('type'),
2178 'url': image.get('src'),
2179 'width': int_or_none(image.get('width')),
2180 'height': int_or_none(image.get('height')),
2181 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2182
2183 return {
2184 'id': video_id,
2185 'title': title or video_id,
2186 'description': description,
2187 'upload_date': upload_date,
2188 'thumbnails': thumbnails,
2189 'formats': formats,
2190 'subtitles': subtitles,
2191 }
2192
2193 def _parse_smil_namespace(self, smil):
2194 return self._search_regex(
2195 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2196
2197 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2198 base = smil_url
2199 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2200 b = meta.get('base') or meta.get('httpBase')
2201 if b:
2202 base = b
2203 break
2204
2205 formats = []
2206 rtmp_count = 0
2207 http_count = 0
2208 m3u8_count = 0
2209
2210 srcs = []
2211 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2212 for medium in media:
2213 src = medium.get('src')
2214 if not src or src in srcs:
2215 continue
2216 srcs.append(src)
2217
2218 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2219 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2220 width = int_or_none(medium.get('width'))
2221 height = int_or_none(medium.get('height'))
2222 proto = medium.get('proto')
2223 ext = medium.get('ext')
2224 src_ext = determine_ext(src)
2225 streamer = medium.get('streamer') or base
2226
2227 if proto == 'rtmp' or streamer.startswith('rtmp'):
2228 rtmp_count += 1
2229 formats.append({
2230 'url': streamer,
2231 'play_path': src,
2232 'ext': 'flv',
2233 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2234 'tbr': bitrate,
2235 'filesize': filesize,
2236 'width': width,
2237 'height': height,
2238 })
2239 if transform_rtmp_url:
2240 streamer, src = transform_rtmp_url(streamer, src)
2241 formats[-1].update({
2242 'url': streamer,
2243 'play_path': src,
2244 })
2245 continue
2246
2247 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2248 src_url = src_url.strip()
2249
2250 if proto == 'm3u8' or src_ext == 'm3u8':
2251 m3u8_formats = self._extract_m3u8_formats(
2252 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2253 if len(m3u8_formats) == 1:
2254 m3u8_count += 1
2255 m3u8_formats[0].update({
2256 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2257 'tbr': bitrate,
2258 'width': width,
2259 'height': height,
2260 })
2261 formats.extend(m3u8_formats)
2262 elif src_ext == 'f4m':
2263 f4m_url = src_url
2264 if not f4m_params:
2265 f4m_params = {
2266 'hdcore': '3.2.0',
2267 'plugin': 'flowplayer-3.2.0.1',
2268 }
2269 f4m_url += '&' if '?' in f4m_url else '?'
2270 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2271 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2272 elif src_ext == 'mpd':
2273 formats.extend(self._extract_mpd_formats(
2274 src_url, video_id, mpd_id='dash', fatal=False))
2275 elif re.search(r'\.ism/[Mm]anifest', src_url):
2276 formats.extend(self._extract_ism_formats(
2277 src_url, video_id, ism_id='mss', fatal=False))
2278 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2279 http_count += 1
2280 formats.append({
2281 'url': src_url,
2282 'ext': ext or src_ext or 'flv',
2283 'format_id': 'http-%d' % (bitrate or http_count),
2284 'tbr': bitrate,
2285 'filesize': filesize,
2286 'width': width,
2287 'height': height,
2288 })
2289
2290 return formats
2291
2292 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2293 urls = []
2294 subtitles = {}
2295 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2296 src = textstream.get('src')
2297 if not src or src in urls:
2298 continue
2299 urls.append(src)
2300 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2301 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2302 subtitles.setdefault(lang, []).append({
2303 'url': src,
2304 'ext': ext,
2305 })
2306 return subtitles
2307
2308 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2309 xspf = self._download_xml(
2310 xspf_url, playlist_id, 'Downloading xspf playlist',
2311 'Unable to download xspf manifest', fatal=fatal)
2312 if xspf is False:
2313 return []
2314 return self._parse_xspf(
2315 xspf, playlist_id, xspf_url=xspf_url,
2316 xspf_base_url=base_url(xspf_url))
2317
2318 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2319 NS_MAP = {
2320 'xspf': 'http://xspf.org/ns/0/',
2321 's1': 'http://static.streamone.nl/player/ns/0',
2322 }
2323
2324 entries = []
2325 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2326 title = xpath_text(
2327 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2328 description = xpath_text(
2329 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2330 thumbnail = xpath_text(
2331 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2332 duration = float_or_none(
2333 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2334
2335 formats = []
2336 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2337 format_url = urljoin(xspf_base_url, location.text)
2338 if not format_url:
2339 continue
2340 formats.append({
2341 'url': format_url,
2342 'manifest_url': xspf_url,
2343 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2344 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2345 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2346 })
2347 self._sort_formats(formats)
2348
2349 entries.append({
2350 'id': playlist_id,
2351 'title': title,
2352 'description': description,
2353 'thumbnail': thumbnail,
2354 'duration': duration,
2355 'formats': formats,
2356 })
2357 return entries
2358
2359 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2360 res = self._download_xml_handle(
2361 mpd_url, video_id,
2362 note=note or 'Downloading MPD manifest',
2363 errnote=errnote or 'Failed to download MPD manifest',
2364 fatal=fatal, data=data, headers=headers, query=query)
2365 if res is False:
2366 return []
2367 mpd_doc, urlh = res
2368 if mpd_doc is None:
2369 return []
2370 mpd_base_url = base_url(urlh.geturl())
2371
2372 return self._parse_mpd_formats(
2373 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2374
2375 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2376 """
2377 Parse formats from MPD manifest.
2378 References:
2379 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2380 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2381 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2382 """
2383 if not self._downloader.params.get('dynamic_mpd'):
2384 if mpd_doc.get('type') == 'dynamic':
2385 return []
2386
2387 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2388
2389 def _add_ns(path):
2390 return self._xpath_ns(path, namespace)
2391
2392 def is_drm_protected(element):
2393 return element.find(_add_ns('ContentProtection')) is not None
2394
2395 def extract_multisegment_info(element, ms_parent_info):
2396 ms_info = ms_parent_info.copy()
2397
2398 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2399 # common attributes and elements. We will only extract what is
2400 # relevant for us.
2401 def extract_common(source):
2402 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2403 if segment_timeline is not None:
2404 s_e = segment_timeline.findall(_add_ns('S'))
2405 if s_e:
2406 ms_info['total_number'] = 0
2407 ms_info['s'] = []
2408 for s in s_e:
2409 r = int(s.get('r', 0))
2410 ms_info['total_number'] += 1 + r
2411 ms_info['s'].append({
2412 't': int(s.get('t', 0)),
2413 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2414 'd': int(s.attrib['d']),
2415 'r': r,
2416 })
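# NOTE (editor's illustration, assumed values): an element such as
# <S t="0" d="90000" r="2"/> with timescale 90000 describes three
# one-second segments and adds 3 to total_number.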
2417 start_number = source.get('startNumber')
2418 if start_number:
2419 ms_info['start_number'] = int(start_number)
2420 timescale = source.get('timescale')
2421 if timescale:
2422 ms_info['timescale'] = int(timescale)
2423 segment_duration = source.get('duration')
2424 if segment_duration:
2425 ms_info['segment_duration'] = float(segment_duration)
2426
2427 def extract_Initialization(source):
2428 initialization = source.find(_add_ns('Initialization'))
2429 if initialization is not None:
2430 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2431
2432 segment_list = element.find(_add_ns('SegmentList'))
2433 if segment_list is not None:
2434 extract_common(segment_list)
2435 extract_Initialization(segment_list)
2436 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2437 if segment_urls_e:
2438 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2439 else:
2440 segment_template = element.find(_add_ns('SegmentTemplate'))
2441 if segment_template is not None:
2442 extract_common(segment_template)
2443 media = segment_template.get('media')
2444 if media:
2445 ms_info['media'] = media
2446 initialization = segment_template.get('initialization')
2447 if initialization:
2448 ms_info['initialization'] = initialization
2449 else:
2450 extract_Initialization(segment_template)
2451 return ms_info
2452
2453 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
2454
2455 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2456 formats = []
2457 for period in mpd_doc.findall(_add_ns('Period')):
2458 period_duration = parse_duration(period.get('duration')) or mpd_duration
2459 period_ms_info = extract_multisegment_info(period, {
2460 'start_number': 1,
2461 'timescale': 1,
2462 })
2463 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2464 if skip_unplayable and is_drm_protected(adaptation_set):
2465 continue
2466 adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2467 for representation in adaptation_set.findall(_add_ns('Representation')):
2468 if skip_unplayable and is_drm_protected(representation):
2469 continue
2470 representation_attrib = adaptation_set.attrib.copy()
2471 representation_attrib.update(representation.attrib)
2472 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2473 mime_type = representation_attrib['mimeType']
2474 content_type = mime_type.split('/')[0]
2475 if content_type == 'text':
2476 # TODO implement WebVTT downloading
2477 pass
2478 elif content_type in ('video', 'audio'):
2479 base_url = ''
2480 for element in (representation, adaptation_set, period, mpd_doc):
2481 base_url_e = element.find(_add_ns('BaseURL'))
2482 if base_url_e is not None:
2483 base_url = base_url_e.text + base_url
2484 if re.match(r'^https?://', base_url):
2485 break
2486 if mpd_base_url and not re.match(r'^https?://', base_url):
2487 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2488 mpd_base_url += '/'
2489 base_url = mpd_base_url + base_url
2490 representation_id = representation_attrib.get('id')
2491 lang = representation_attrib.get('lang')
2492 url_el = representation.find(_add_ns('BaseURL'))
2493 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2494 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2495 f = {
2496 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2497 'manifest_url': mpd_url,
2498 'ext': mimetype2ext(mime_type),
2499 'width': int_or_none(representation_attrib.get('width')),
2500 'height': int_or_none(representation_attrib.get('height')),
2501 'tbr': float_or_none(bandwidth, 1000),
2502 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2503 'fps': int_or_none(representation_attrib.get('frameRate')),
2504 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2505 'format_note': 'DASH %s' % content_type,
2506 'filesize': filesize,
2507 'container': mimetype2ext(mime_type) + '_dash',
2508 }
2509 f.update(parse_codecs(representation_attrib.get('codecs')))
2510 representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
2511
2512 def prepare_template(template_name, identifiers):
2513 tmpl = representation_ms_info[template_name]
2514 # First off, % characters outside $...$ templates
2515 # must be escaped by doubling for proper processing
2516 # by the % string-formatting operator used below (see
2517 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2518 t = ''
2519 in_template = False
2520 for c in tmpl:
2521 t += c
2522 if c == '$':
2523 in_template = not in_template
2524 elif c == '%' and not in_template:
2525 t += c
2526 # Next, $...$ templates are translated to their
2527 # %(...) counterparts to be used with the % operator
2528 t = t.replace('$RepresentationID$', representation_id)
2529 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2530 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2531 t = t.replace('$$', '$')  # assign the result; str.replace does not modify in place
2532 return t
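# NOTE (editor's illustration, assumed values): a template such as
# 'seg-$RepresentationID$-$Number%05d$.m4s' becomes
# 'seg-video1-%(Number)05d.m4s' for representation_id 'video1',
# ready for the % operator used below.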
2533
2534 # @initialization is a regular template like @media one
2535 # so it should be handled just the same way (see
2536 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2537 if 'initialization' in representation_ms_info:
2538 initialization_template = prepare_template(
2539 'initialization',
2540 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2541 # $Time$ shall not be included for @initialization thus
2542 # only $Bandwidth$ remains
2543 ('Bandwidth', ))
2544 representation_ms_info['initialization_url'] = initialization_template % {
2545 'Bandwidth': bandwidth,
2546 }
2547
2548 def location_key(location):
2549 return 'url' if re.match(r'^https?://', location) else 'path'
2550
2551 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2552
2553 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2554 media_location_key = location_key(media_template)
2555
2556 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2557 # can't be used at the same time
2558 if '%(Number' in media_template and 's' not in representation_ms_info:
2559 segment_duration = None
2560 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2561 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2562 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2563 representation_ms_info['fragments'] = [{
2564 media_location_key: media_template % {
2565 'Number': segment_number,
2566 'Bandwidth': bandwidth,
2567 },
2568 'duration': segment_duration,
2569 } for segment_number in range(
2570 representation_ms_info['start_number'],
2571 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
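# NOTE (editor's illustration, assumed values): a 60 s period with a
# 4 s segment_duration yields total_number = ceil(60 / 4) = 15
# fragments, numbered from start_number upwards.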
2572 else:
2573 # $Number*$ or $Time$ in media template with S list available
2574 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2575 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2576 representation_ms_info['fragments'] = []
2577 segment_time = 0
2578 segment_d = None
2579 segment_number = representation_ms_info['start_number']
2580
2581 def add_segment_url():
2582 segment_url = media_template % {
2583 'Time': segment_time,
2584 'Bandwidth': bandwidth,
2585 'Number': segment_number,
2586 }
2587 representation_ms_info['fragments'].append({
2588 media_location_key: segment_url,
2589 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2590 })
2591
2592 for num, s in enumerate(representation_ms_info['s']):
2593 segment_time = s.get('t') or segment_time
2594 segment_d = s['d']
2595 add_segment_url()
2596 segment_number += 1
2597 for r in range(s.get('r', 0)):
2598 segment_time += segment_d
2599 add_segment_url()
2600 segment_number += 1
2601 segment_time += segment_d
2602 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2603 # No media template
2604 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2605 # or any YouTube dashsegments video
2606 fragments = []
2607 segment_index = 0
2608 timescale = representation_ms_info['timescale']
2609 for s in representation_ms_info['s']:
2610 duration = float_or_none(s['d'], timescale)
2611 for r in range(s.get('r', 0) + 1):
2612 segment_uri = representation_ms_info['segment_urls'][segment_index]
2613 fragments.append({
2614 location_key(segment_uri): segment_uri,
2615 'duration': duration,
2616 })
2617 segment_index += 1
2618 representation_ms_info['fragments'] = fragments
2619 elif 'segment_urls' in representation_ms_info:
2620 # Segment URLs with no SegmentTimeline
2621 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2622 # https://github.com/ytdl-org/youtube-dl/pull/14844
2623 fragments = []
2624 segment_duration = float_or_none(
2625 representation_ms_info['segment_duration'],
2626 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2627 for segment_url in representation_ms_info['segment_urls']:
2628 fragment = {
2629 location_key(segment_url): segment_url,
2630 }
2631 if segment_duration:
2632 fragment['duration'] = segment_duration
2633 fragments.append(fragment)
2634 representation_ms_info['fragments'] = fragments
2635 # If there is a fragments key available then we correctly recognized fragmented media.
2636 # Otherwise we will assume unfragmented media with direct access. Technically, such
2637 # an assumption is not necessarily correct since we may simply have no support for
2638 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2639 if 'fragments' in representation_ms_info:
2640 f.update({
2641 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2642 'url': mpd_url or base_url,
2643 'fragment_base_url': base_url,
2644 'fragments': [],
2645 'protocol': 'http_dash_segments',
2646 })
2647 if 'initialization_url' in representation_ms_info:
2648 initialization_url = representation_ms_info['initialization_url']
2649 if not f.get('url'):
2650 f['url'] = initialization_url
2651 f['fragments'].append({location_key(initialization_url): initialization_url})
2652 f['fragments'].extend(representation_ms_info['fragments'])
2653 else:
2654 # Assuming direct URL to unfragmented media.
2655 f['url'] = base_url
2656 formats.append(f)
2657 else:
2658 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2659 return formats
2660
2661 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2662 res = self._download_xml_handle(
2663 ism_url, video_id,
2664 note=note or 'Downloading ISM manifest',
2665 errnote=errnote or 'Failed to download ISM manifest',
2666 fatal=fatal, data=data, headers=headers, query=query)
2667 if res is False:
2668 return []
2669 ism_doc, urlh = res
2670 if ism_doc is None:
2671 return []
2672
2673 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2674
2675 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2676 """
2677 Parse formats from ISM manifest.
2678 References:
2679 1. [MS-SSTR]: Smooth Streaming Protocol,
2680 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2681 """
2682 if ism_doc.get('IsLive') == 'TRUE':
2683 return []
2684 if (not self._downloader.params.get('allow_unplayable_formats')
2685 and ism_doc.find('Protection') is not None):
2686 return []
2687
2688 duration = int(ism_doc.attrib['Duration'])
2689 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2690
2691 formats = []
2692 for stream in ism_doc.findall('StreamIndex'):
2693 stream_type = stream.get('Type')
2694 if stream_type not in ('video', 'audio'):
2695 continue
2696 url_pattern = stream.attrib['Url']
2697 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2698 stream_name = stream.get('Name')
2699 for track in stream.findall('QualityLevel'):
2700 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2701 # TODO: add support for WVC1 and WMAP
2702 if fourcc not in ('H264', 'AVC1', 'AACL'):
2703 self.report_warning('%s is not a supported codec' % fourcc)
2704 continue
2705 tbr = int(track.attrib['Bitrate']) // 1000
2706 # [1] does not mention Width and Height attributes. However,
2707 # they're often present while MaxWidth and MaxHeight are
2708 # missing, so should be used as fallbacks
2709 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2710 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2711 sampling_rate = int_or_none(track.get('SamplingRate'))
2712
2713 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2714 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2715
2716 fragments = []
2717 fragment_ctx = {
2718 'time': 0,
2719 }
2720 stream_fragments = stream.findall('c')
2721 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2722 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2723 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2724 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2725 if not fragment_ctx['duration']:
2726 try:
2727 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # look up the next <c> element in the list
2728 except IndexError:
2729 next_fragment_time = duration
2730 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
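# NOTE (editor's illustration, assumed values): with time=0, a next
# fragment at t=60000000 and r=2, each repeated fragment is assigned
# a duration of 30000000 timescale ticks.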
2731 for _ in range(fragment_repeat):
2732 fragments.append({
2733 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2734 'duration': fragment_ctx['duration'] / stream_timescale,
2735 })
2736 fragment_ctx['time'] += fragment_ctx['duration']
2737
2738 format_id = []
2739 if ism_id:
2740 format_id.append(ism_id)
2741 if stream_name:
2742 format_id.append(stream_name)
2743 format_id.append(compat_str(tbr))
2744
2745 formats.append({
2746 'format_id': '-'.join(format_id),
2747 'url': ism_url,
2748 'manifest_url': ism_url,
2749 'ext': 'ismv' if stream_type == 'video' else 'isma',
2750 'width': width,
2751 'height': height,
2752 'tbr': tbr,
2753 'asr': sampling_rate,
2754 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2755 'acodec': 'none' if stream_type == 'video' else fourcc,
2756 'protocol': 'ism',
2757 'fragments': fragments,
2758 '_download_params': {
2759 'duration': duration,
2760 'timescale': stream_timescale,
2761 'width': width or 0,
2762 'height': height or 0,
2763 'fourcc': fourcc,
2764 'codec_private_data': track.get('CodecPrivateData'),
2765 'sampling_rate': sampling_rate,
2766 'channels': int_or_none(track.get('Channels', 2)),
2767 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2768 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2769 },
2770 })
2771 return formats
2772
2773 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2774 def absolute_url(item_url):
2775 return urljoin(base_url, item_url)
2776
2777 def parse_content_type(content_type):
2778 if not content_type:
2779 return {}
2780 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2781 if ctr:
2782 mimetype, codecs = ctr.groups()
2783 f = parse_codecs(codecs)
2784 f['ext'] = mimetype2ext(mimetype)
2785 return f
2786 return {}
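# NOTE (editor's illustration, assumed value): a content type of
# 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"' yields
# {'ext': 'mp4', 'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}.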
2787
2788 def _media_formats(src, cur_media_type, type_info={}):
2789 full_url = absolute_url(src)
2790 ext = type_info.get('ext') or determine_ext(full_url)
2791 if ext == 'm3u8':
2792 is_plain_url = False
2793 formats = self._extract_m3u8_formats(
2794 full_url, video_id, ext='mp4',
2795 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2796 preference=preference, quality=quality, fatal=False)
2797 elif ext == 'mpd':
2798 is_plain_url = False
2799 formats = self._extract_mpd_formats(
2800 full_url, video_id, mpd_id=mpd_id, fatal=False)
2801 else:
2802 is_plain_url = True
2803 formats = [{
2804 'url': full_url,
2805 'vcodec': 'none' if cur_media_type == 'audio' else None,
2806 }]
2807 return is_plain_url, formats
2808
2809 entries = []
2810 # amp-video and amp-audio are very similar to their HTML5 counterparts
2811 # so we will include them right here (see
2812 # https://www.ampproject.org/docs/reference/components/amp-video)
2813 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2814 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2815 media_tags = [(media_tag, media_tag_name, media_type, '')
2816 for media_tag, media_tag_name, media_type
2817 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2818 media_tags.extend(re.findall(
2819 # We only allow video|audio followed by a whitespace or '>'.
2820 # Allowing more characters may end up in significant slow down (see
2821 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2822 # http://www.porntrex.com/maps/videositemap.xml).
2823 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
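# NOTE (editor's illustration, hypothetical markup): the first regex
# matches self-closing tags such as '<amp-video src="v.mp4"/>', while
# the second matches paired tags such as '<video ...>...</video>'.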
2824 for media_tag, _, media_type, media_content in media_tags:
2825 media_info = {
2826 'formats': [],
2827 'subtitles': {},
2828 }
2829 media_attributes = extract_attributes(media_tag)
2830 src = strip_or_none(media_attributes.get('src'))
2831 if src:
2832 _, formats = _media_formats(src, media_type)
2833 media_info['formats'].extend(formats)
2834 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2835 if media_content:
2836 for source_tag in re.findall(r'<source[^>]+>', media_content):
2837 s_attr = extract_attributes(source_tag)
2838 # data-video-src and data-src are non-standard but seen
2839 # several times in the wild
2840 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2841 if not src:
2842 continue
2843 f = parse_content_type(s_attr.get('type'))
2844 is_plain_url, formats = _media_formats(src, media_type, f)
2845 if is_plain_url:
2846 # width, height, res, label and title attributes are
2847 # all non-standard but seen several times in the wild
2848 labels = [
2849 s_attr.get(lbl)
2850 for lbl in ('label', 'title')
2851 if str_or_none(s_attr.get(lbl))
2852 ]
2853 width = int_or_none(s_attr.get('width'))
2854 height = (int_or_none(s_attr.get('height'))
2855 or int_or_none(s_attr.get('res')))
2856 if not width or not height:
2857 for lbl in labels:
2858 resolution = parse_resolution(lbl)
2859 if not resolution:
2860 continue
2861 width = width or resolution.get('width')
2862 height = height or resolution.get('height')
2863 for lbl in labels:
2864 tbr = parse_bitrate(lbl)
2865 if tbr:
2866 break
2867 else:
2868 tbr = None
2869 f.update({
2870 'width': width,
2871 'height': height,
2872 'tbr': tbr,
2873 'format_id': s_attr.get('label') or s_attr.get('title'),
2874 })
2875 f.update(formats[0])
2876 media_info['formats'].append(f)
2877 else:
2878 media_info['formats'].extend(formats)
2879 for track_tag in re.findall(r'<track[^>]+>', media_content):
2880 track_attributes = extract_attributes(track_tag)
2881 kind = track_attributes.get('kind')
2882 if not kind or kind in ('subtitles', 'captions'):
2883 src = strip_or_none(track_attributes.get('src'))
2884 if not src:
2885 continue
2886 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2887 media_info['subtitles'].setdefault(lang, []).append({
2888 'url': absolute_url(src),
2889 })
2890 for f in media_info['formats']:
2891 f.setdefault('http_headers', {})['Referer'] = base_url
2892 if media_info['formats'] or media_info['subtitles']:
2893 entries.append(media_info)
2894 return entries
2895
2896 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2897 signed = 'hdnea=' in manifest_url
2898 if not signed:
2899 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2900 manifest_url = re.sub(
2901 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2902 '', manifest_url).strip('?')
2903
2904 formats = []
2905
2906 hdcore_sign = 'hdcore=3.7.0'
2907 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2908 hds_host = hosts.get('hds')
2909 if hds_host:
2910 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2911 if 'hdcore=' not in f4m_url:
2912 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2913 f4m_formats = self._extract_f4m_formats(
2914 f4m_url, video_id, f4m_id='hds', fatal=False)
2915 for entry in f4m_formats:
2916 entry.update({'extra_param_to_segment_url': hdcore_sign})
2917 formats.extend(f4m_formats)
2918
2919 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2920 hls_host = hosts.get('hls')
2921 if hls_host:
2922 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2923 m3u8_formats = self._extract_m3u8_formats(
2924 m3u8_url, video_id, 'mp4', 'm3u8_native',
2925 m3u8_id='hls', fatal=False)
2926 formats.extend(m3u8_formats)
2927
2928 http_host = hosts.get('http')
2929 if http_host and m3u8_formats and not signed:
2930 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2931 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2932 qualities_length = len(qualities)
2933 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2934 i = 0
2935 for f in m3u8_formats:
2936 if f['vcodec'] != 'none':
2937 for protocol in ('http', 'https'):
2938 http_f = f.copy()
2939 del http_f['manifest_url']
2940 http_url = re.sub(
2941 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2942 http_f.update({
2943 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2944 'url': http_url,
2945 'protocol': protocol,
2946 })
2947 formats.append(http_f)
2948 i += 1
2949
2950 return formats
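# Illustrative sketch, not part of the library: given an Akamai CSMIL
# manifest URL, a subclass might call the helper above like this (the
# URL and host mapping are made up):
#
#     formats = self._extract_akamai_formats(
#         'http://example-i.akamaihd.net/i/foo/bar/,500,1000,.mp4.csmil/master.m3u8',
#         video_id, hosts={'http': 'example-vh.akamaihd.net'})
#     self._sort_formats(formats)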
2951
2952 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2953 query = compat_urlparse.urlparse(url).query
2954 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2955 mobj = re.search(
2956 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2957 url_base = mobj.group('url')
2958 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2959 formats = []
2960
2961 def manifest_url(manifest):
2962 m_url = '%s/%s' % (http_base_url, manifest)
2963 if query:
2964 m_url += '?%s' % query
2965 return m_url
2966
2967 if 'm3u8' not in skip_protocols:
2968 formats.extend(self._extract_m3u8_formats(
2969 manifest_url('playlist.m3u8'), video_id, 'mp4',
2970 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2971 if 'f4m' not in skip_protocols:
2972 formats.extend(self._extract_f4m_formats(
2973 manifest_url('manifest.f4m'),
2974 video_id, f4m_id='hds', fatal=False))
2975 if 'dash' not in skip_protocols:
2976 formats.extend(self._extract_mpd_formats(
2977 manifest_url('manifest.mpd'),
2978 video_id, mpd_id='dash', fatal=False))
2979 if re.search(r'(?:/smil:|\.smil)', url_base):
2980 if 'smil' not in skip_protocols:
2981 rtmp_formats = self._extract_smil_formats(
2982 manifest_url('jwplayer.smil'),
2983 video_id, fatal=False)
2984 for rtmp_format in rtmp_formats:
2985 rtsp_format = rtmp_format.copy()
2986 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2987 del rtsp_format['play_path']
2988 del rtsp_format['ext']
2989 rtsp_format.update({
2990 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2991 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2992 'protocol': 'rtsp',
2993 })
2994 formats.extend([rtmp_format, rtsp_format])
2995 else:
2996 for protocol in ('rtmp', 'rtsp'):
2997 if protocol not in skip_protocols:
2998 formats.append({
2999 'url': '%s:%s' % (protocol, url_base),
3000 'format_id': protocol,
3001 'protocol': protocol,
3002 })
3003 return formats
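# Illustrative sketch, not part of the library: the helper above strips
# any known manifest name from the URL and probes the sibling manifests
# itself, so a caller may pass whichever variant the page exposes (the
# URL is made up):
#
#     formats = self._extract_wowza_formats(
#         'https://wowza.example.com/live/stream/playlist.m3u8',
#         video_id, skip_protocols=['dash', 'rtmp', 'rtsp'])
#     self._sort_formats(formats)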
3004
3005 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3006 mobj = re.search(
3007 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3008 webpage)
3009 if mobj:
3010 try:
3011 jwplayer_data = self._parse_json(mobj.group('options'),
3012 video_id=video_id,
3013 transform_source=transform_source)
3014 except ExtractorError:
3015 pass
3016 else:
3017 if isinstance(jwplayer_data, dict):
3018 return jwplayer_data
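# Illustrative sketch, not part of the library: the regex above targets
# inline setup calls of this shape (the markup is made up):
#
#     <script>
#         jwplayer("player-1").setup({"playlist": [{"sources": [...]}]});
#     </script>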
3019
3020 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3021 jwplayer_data = self._find_jwplayer_data(
3022 webpage, video_id, transform_source=js_to_json)
3023 return self._parse_jwplayer_data(
3024 jwplayer_data, video_id, *args, **kwargs)
3025
3026 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3027 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3028 # JWPlayer backward compatibility: flattened playlists
3029 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3030 if 'playlist' not in jwplayer_data:
3031 jwplayer_data = {'playlist': [jwplayer_data]}
3032
3033 entries = []
3034
3035 # JWPlayer backward compatibility: single playlist item
3036 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3037 if not isinstance(jwplayer_data['playlist'], list):
3038 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3039
3040 for video_data in jwplayer_data['playlist']:
3041 # JWPlayer backward compatibility: flattened sources
3042 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3043 if 'sources' not in video_data:
3044 video_data['sources'] = [video_data]
3045
3046 this_video_id = video_id or video_data['mediaid']
3047
3048 formats = self._parse_jwplayer_formats(
3049 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3050 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3051
3052 subtitles = {}
3053 tracks = video_data.get('tracks')
3054 if tracks and isinstance(tracks, list):
3055 for track in tracks:
3056 if not isinstance(track, dict):
3057 continue
3058 track_kind = track.get('kind')
3059 if not track_kind or not isinstance(track_kind, compat_str):
3060 continue
3061 if track_kind.lower() not in ('captions', 'subtitles'):
3062 continue
3063 track_url = urljoin(base_url, track.get('file'))
3064 if not track_url:
3065 continue
3066 subtitles.setdefault(track.get('label') or 'en', []).append({
3067 'url': self._proto_relative_url(track_url)
3068 })
3069
3070 entry = {
3071 'id': this_video_id,
3072 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3073 'description': clean_html(video_data.get('description')),
3074 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3075 'timestamp': int_or_none(video_data.get('pubdate')),
3076 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3077 'subtitles': subtitles,
3078 }
3079 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3080 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3081 entry.update({
3082 '_type': 'url_transparent',
3083 'url': formats[0]['url'],
3084 })
3085 else:
3086 self._sort_formats(formats)
3087 entry['formats'] = formats
3088 entries.append(entry)
3089 if len(entries) == 1:
3090 return entries[0]
3091 else:
3092 return self.playlist_result(entries)
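# Illustrative sketch, not part of the library: the usual call chain in
# a site extractor finds the player config and parses it in one step
# (the variable names are made up):
#
#     info = self._extract_jwplayer_data(
#         webpage, video_id, require_title=False, base_url=url)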
3093
3094 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3095 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3096 urls = []
3097 formats = []
3098 for source in jwplayer_sources_data:
3099 if not isinstance(source, dict):
3100 continue
3101 source_url = urljoin(
3102 base_url, self._proto_relative_url(source.get('file')))
3103 if not source_url or source_url in urls:
3104 continue
3105 urls.append(source_url)
3106 source_type = source.get('type') or ''
3107 ext = mimetype2ext(source_type) or determine_ext(source_url)
3108 if source_type == 'hls' or ext == 'm3u8':
3109 formats.extend(self._extract_m3u8_formats(
3110 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3111 m3u8_id=m3u8_id, fatal=False))
3112 elif source_type == 'dash' or ext == 'mpd':
3113 formats.extend(self._extract_mpd_formats(
3114 source_url, video_id, mpd_id=mpd_id, fatal=False))
3115 elif ext == 'smil':
3116 formats.extend(self._extract_smil_formats(
3117 source_url, video_id, fatal=False))
3118 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3119 elif source_type.startswith('audio') or ext in (
3120 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3121 formats.append({
3122 'url': source_url,
3123 'vcodec': 'none',
3124 'ext': ext,
3125 })
3126 else:
3127 height = int_or_none(source.get('height'))
3128 if height is None:
3129 # Often no height is provided but there is a label in a
3130 # format like "1080p", "720p SD", or 1080.
3131 height = int_or_none(self._search_regex(
3132 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3133 'height', default=None))
3134 a_format = {
3135 'url': source_url,
3136 'width': int_or_none(source.get('width')),
3137 'height': height,
3138 'tbr': int_or_none(source.get('bitrate')),
3139 'ext': ext,
3140 }
3141 if source_url.startswith('rtmp'):
3142 a_format['ext'] = 'flv'
3143 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3144 # of jwplayer.flash.swf
3145 rtmp_url_parts = re.split(
3146 r'((?:mp4|mp3|flv):)', source_url, 1)
3147 if len(rtmp_url_parts) == 3:
3148 rtmp_url, prefix, play_path = rtmp_url_parts
3149 a_format.update({
3150 'url': rtmp_url,
3151 'play_path': prefix + play_path,
3152 })
3153 if rtmp_params:
3154 a_format.update(rtmp_params)
3155 formats.append(a_format)
3156 return formats
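# Illustrative sketch, not part of the library: a typical JWPlayer
# sources array handled by the method above (the values are made up):
#
#     [{"file": "//cdn.example.com/master.m3u8", "type": "hls"},
#      {"file": "//cdn.example.com/720.mp4", "label": "720p",
#       "width": 1280, "height": 720}]
#
# The first source goes through _extract_m3u8_formats; the second is
# taken as a plain progressive format with the given dimensions.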
3157
3158 def _live_title(self, name):
3159 """ Generate the title for a live video """
3160 now = datetime.datetime.now()
3161 now_str = now.strftime('%Y-%m-%d %H:%M')
3162 return name + ' ' + now_str
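# For example, _live_title('Channel stream') would return something
# like 'Channel stream 2021-03-01 12:00', depending on the local clock.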
3163
3164 def _int(self, v, name, fatal=False, **kwargs):
3165 res = int_or_none(v, **kwargs)
3168 if res is None:
3169 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3170 if fatal:
3171 raise ExtractorError(msg)
3172 else:
3173 self._downloader.report_warning(msg)
3174 return res
3175
3176 def _float(self, v, name, fatal=False, **kwargs):
3177 res = float_or_none(v, **kwargs)
3178 if res is None:
3179 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3180 if fatal:
3181 raise ExtractorError(msg)
3182 else:
3183 self._downloader.report_warning(msg)
3184 return res
3185
3186 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3187 path='/', secure=False, discard=False, rest={}, **kwargs):
3188 cookie = compat_cookiejar_Cookie(
3189 0, name, value, port, port is not None, domain, True,
3190 domain.startswith('.'), path, True, secure, expire_time,
3191 discard, None, None, rest)
3192 self._downloader.cookiejar.set_cookie(cookie)
3193
3194 def _get_cookies(self, url):
3195 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
3196 req = sanitized_Request(url)
3197 self._downloader.cookiejar.add_cookie_header(req)
3198 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
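# Illustrative sketch, not part of the library: reading a cookie value
# set by an earlier request (the cookie name is made up):
#
#     morsel = self._get_cookies('https://example.com').get('session_id')
#     if morsel:
#         token = morsel.value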
3199
3200 def _apply_first_set_cookie_header(self, url_handle, cookie):
3201 """
3202 Apply first Set-Cookie header instead of the last. Experimental.
3203
3204 Some sites (e.g. [1-3]) may serve two cookies under the same name
3205 in the Set-Cookie header and expect the first (old) one to be set
3206 rather than the second (new) one. However, per RFC 6265 the newer
3207 cookie is the one that actually ends up in the cookie store.
3208 We work around this issue by manually resetting the cookie to
3209 the first one.
3210 1. https://new.vk.com/
3211 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3212 3. https://learning.oreilly.com/
3213 """
3214 for header, cookies in url_handle.headers.items():
3215 if header.lower() != 'set-cookie':
3216 continue
3217 if sys.version_info[0] >= 3:
3218 cookies = cookies.encode('iso-8859-1')
3219 cookies = cookies.decode('utf-8')
3220 cookie_value = re.search(
3221 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3222 if cookie_value:
3223 value, domain = cookie_value.groups()
3224 self._set_cookie(domain, cookie, value)
3225 break
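# Illustrative sketch, not part of the library: a subclass would call
# this right after the request whose response carries the duplicate
# cookies (the cookie name is made up):
#
#     urlh = self._request_webpage(url, video_id)
#     self._apply_first_set_cookie_header(urlh, 'sessionid')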
3226
3227 def get_testcases(self, include_onlymatching=False):
3228 t = getattr(self, '_TEST', None)
3229 if t:
3230 assert not hasattr(self, '_TESTS'), \
3231 '%s has _TEST and _TESTS' % type(self).__name__
3232 tests = [t]
3233 else:
3234 tests = getattr(self, '_TESTS', [])
3235 for t in tests:
3236 if not include_onlymatching and t.get('only_matching', False):
3237 continue
3238 t['name'] = type(self).__name__[:-len('IE')]
3239 yield t
3240
3241 def is_suitable(self, age_limit):
3242 """ Test whether the extractor is generally suitable for the given
3243 age limit (i.e. pornographic sites are not, all others usually are) """
3244
3245 any_restricted = False
3246 for tc in self.get_testcases(include_onlymatching=False):
3247 if tc.get('playlist', []):
3248 tc = tc['playlist'][0]
3249 is_restricted = age_restricted(
3250 tc.get('info_dict', {}).get('age_limit'), age_limit)
3251 if not is_restricted:
3252 return True
3253 any_restricted = any_restricted or is_restricted
3254 return not any_restricted
3255
3256 def extract_subtitles(self, *args, **kwargs):
3257 if (self._downloader.params.get('writesubtitles', False)
3258 or self._downloader.params.get('listsubtitles')):
3259 return self._get_subtitles(*args, **kwargs)
3260 return {}
3261
3262 def _get_subtitles(self, *args, **kwargs):
3263 raise NotImplementedError('This method must be implemented by subclasses')
3264
3265 @staticmethod
3266 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3267 """ Merge subtitle items for one language. Items with duplicated URLs
3268 will be dropped. """
3269 list1_urls = {item['url'] for item in subtitle_list1}
3270 ret = list(subtitle_list1)
3271 ret.extend(item for item in subtitle_list2 if item['url'] not in list1_urls)
3272 return ret
3273
3274 @classmethod
3275 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3276 """ Merge two subtitle dictionaries, language by language. """
3277 ret = dict(subtitle_dict1)
3278 for lang in subtitle_dict2:
3279 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3280 return ret
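# Illustrative sketch, not part of the library: merging subtitle dicts
# gathered from two manifests (the data is made up):
#
#     subs = self._merge_subtitles(
#         {'en': [{'url': 'https://example.com/en.vtt'}]},
#         {'en': [{'url': 'https://example.com/en.srt'}],
#          'de': [{'url': 'https://example.com/de.vtt'}]})
#
# Both 'en' items are kept (distinct URLs) and 'de' is added.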
3281
3282 def extract_automatic_captions(self, *args, **kwargs):
3283 if (self._downloader.params.get('writeautomaticsub', False)
3284 or self._downloader.params.get('listsubtitles')):
3285 return self._get_automatic_captions(*args, **kwargs)
3286 return {}
3287
3288 def _get_automatic_captions(self, *args, **kwargs):
3289 raise NotImplementedError('This method must be implemented by subclasses')
3290
3291 def mark_watched(self, *args, **kwargs):
3292 if (self._downloader.params.get('mark_watched', False)
3293 and (self._get_login_info()[0] is not None
3294 or self._downloader.params.get('cookiefile') is not None)):
3295 self._mark_watched(*args, **kwargs)
3296
3297 def _mark_watched(self, *args, **kwargs):
3298 raise NotImplementedError('This method must be implemented by subclasses')
3299
3300 def geo_verification_headers(self):
3301 headers = {}
3302 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3303 if geo_verification_proxy:
3304 headers['Ytdl-request-proxy'] = geo_verification_proxy
3305 return headers
3306
3307 def _generic_id(self, url):
3308 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3309
3310 def _generic_title(self, url):
3311 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
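# For example, _generic_id('https://example.com/clips/My%20Video.mp4')
# and _generic_title of the same URL both return 'My Video'.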
3312
3313
3314 class SearchInfoExtractor(InfoExtractor):
3315 """
3316 Base class for paged search query extractors.
3317 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3318 Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
3319 """
3320
3321 @classmethod
3322 def _make_valid_url(cls):
3323 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3324
3325 @classmethod
3326 def suitable(cls, url):
3327 return re.match(cls._make_valid_url(), url) is not None
3328
3329 def _real_extract(self, query):
3330 mobj = re.match(self._make_valid_url(), query)
3331 if mobj is None:
3332 raise ExtractorError('Invalid search query "%s"' % query)
3333
3334 prefix = mobj.group('prefix')
3335 query = mobj.group('query')
3336 if prefix == '':
3337 return self._get_n_results(query, 1)
3338 elif prefix == 'all':
3339 return self._get_n_results(query, self._MAX_RESULTS)
3340 else:
3341 n = int(prefix)
3342 if n <= 0:
3343 raise ExtractorError('Invalid download number %s for query "%s"' % (n, query))
3344 elif n > self._MAX_RESULTS:
3345 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3346 n = self._MAX_RESULTS
3347 return self._get_n_results(query, n)
3348
3349 def _get_n_results(self, query, n):
3350 """Get a specified number of results for a query"""
3351 raise NotImplementedError('This method must be implemented by subclasses')
3352
3353 @property
3354 def SEARCH_KEY(self):
3355 return self._SEARCH_KEY
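# Illustrative sketch, not part of the library: a minimal subclass of
# SearchInfoExtractor (the names and the API endpoint are made up):
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'examplesearch'
#         _MAX_RESULTS = 50
#
#         def _get_n_results(self, query, n):
#             data = self._download_json(
#                 'https://example.com/api/search?q=' + query, query)
#             entries = [self.url_result(e['url']) for e in data['items']]
#             return self.playlist_result(entries[:n], query)
#
# With this in place, 'examplesearch5:kittens' fetches the first five
# results and 'examplesearchall:kittens' fetches up to _MAX_RESULTS.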