# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, the author,
    and other metadata. The information is stored in a dictionary which is
    then passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality (see the illustrative sketch
                    after this field list).

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                 for plain file media - HTTP URL of this file,
                                 for RTMP - RTMP URL,
                                 for HLS - URL of the M3U8 media playlist,
                                 for HDS - URL of the F4M manifest,
                                 for DASH - URL of the MPD manifest or
                                            base URL representing the media
                                            if MPD manifest is parsed from
                                            a string,
                                 for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                 for HLS - URL of the M3U8 master playlist,
                                 for HDS - URL of the F4M manifest,
                                 for DASH - URL of the MPD manifest,
                                 for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present it should be
                                 used by the client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

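                    For illustration, a sketch of what a formats list might
                    look like (all values here are made up, not produced by
                    any particular extractor):

                        'formats': [{
                            'format_id': 'hls-audio',
                            'url': 'https://example.com/audio.m3u8',
                            'ext': 'm4a',
                            'protocol': 'm3u8_native',
                            'vcodec': 'none',
                            'abr': 128,
                        }, {
                            'format_id': 'http-720p',
                            'url': 'https://example.com/video-720.mp4',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'tbr': 2500,
                        }]
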
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional except that at least one of text
                    or html must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream rather than a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

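    For illustration, a sketch of a minimal "video" result (all values here
    are made up):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/video.mp4',
            'ext': 'mp4',
            'uploader': 'Some Uploader',
            'duration': 123.0,
        }
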

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp (a minimal sketch
    follows). Probably, they should also be added to the list of extractors.

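    For illustration, a sketch of a minimal subclass (the site, URL pattern
    and field values are hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': 'https://example.com/media/%s.mp4' % video_id,
                }
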
    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

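    # For illustration only: a sketch of how suitable() and _match_id()
    # interact with _VALID_URL, assuming a hypothetical subclass whose
    # _VALID_URL is r'https?://example\.com/v/(?P<id>\d+)':
    #
    #     ExampleIE.suitable('https://example.com/v/42')   # -> True
    #     ExampleIE._match_id('https://example.com/v/42')  # -> '42'
    #
    # Note that _match_id() requires the pattern to define an 'id' named group.
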
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from the provided country
        list is selected and a random IP belonging to this country is generated.
        This IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

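    # For illustration only: a sketch of the manual call described in the
    # docstring above, e.g. when geo bypass data only becomes known during
    # extraction (the country codes and CIDR block here are hypothetical):
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['DE', 'FR'],
    #         'ip_blocks': ['192.0.2.0/24'],
    #     })
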
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

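    # For illustration only: a sketch of typical _download_webpage usage
    # inside _real_extract (the URL and note text are hypothetical):
    #
    #     webpage = self._download_webpage(
    #         'https://example.com/v/%s' % video_id, video_id,
    #         note='Downloading video page',
    #         # treat 404 as an acceptable response instead of failing
    #         expected_status=404)
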
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

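    # For illustration only: a sketch of _download_json usage with a
    # transform_source hook (the endpoint is hypothetical; js_to_json is
    # imported from ..utils above):
    #
    #     data = self._download_json(
    #         'https://example.com/api/video/%s' % video_id, video_id,
    #         # relax almost-JSON (e.g. single-quoted strings) before parsing
    #         transform_source=js_to_json, fatal=False)
    #     title = data['title'] if data else None
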
    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning or raise
        a RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

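    # For illustration only: a sketch of _search_regex usage (the webpage
    # markup these patterns target is hypothetical):
    #
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
    #         default=None)  # non-fatal: returns None when nothing matches
    #     video_id = self._search_regex(
    #         r'data-video-id=(["\'])(?P<id>\d+)\1', webpage, 'video id',
    #         group='id')  # fatal: raises RegexNotFoundError on failure
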
    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

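    # For illustration only: a sketch of the OpenGraph/meta helpers applied
    # to a hypothetical page containing tags such as
    # <meta property="og:title" content="..."> (the meta names passed to
    # _html_search_meta below are illustrative):
    #
    #     title = self._og_search_title(webpage, default=None)
    #     thumbnail = self._og_search_thumbnail(webpage)
    #     uploader = self._html_search_meta(
    #         ['author', 'og:site_name'], webpage, 'uploader', default=None)
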
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
        default = kwargs.get('default', NO_DEFAULT)
        if not json_ld:
            return default if default is not NO_DEFAULT else {}
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema\.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                break
        return dict((k, v) for k, v in info.items() if v is not None)

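    # For illustration only: a sketch of JSON-LD extraction feeding the info
    # dict (the page contents are hypothetical):
    #
    #     info = self._search_json_ld(
    #         webpage, video_id, expected_type='VideoObject', default={})
    #     # merge JSON-LD fields with extractor-specific ones
    #     info.update({
    #         'id': video_id,
    #         'title': info.get('title') or self._og_search_title(webpage),
    #     })
    #     return info
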
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)

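    # For illustration only: a sketch of how an extractor typically finishes
    # building its formats list (the formats themselves are hypothetical):
    #
    #     formats = [...]              # collected HTTP/HLS/DASH format dicts
    #     self._remove_duplicate_formats(formats)
    #     self._sort_formats(formats)  # sorts in place, worst first
    #     # or rank purely by explicit fields:
    #     self._sort_formats(
    #         formats, field_preference=('height', 'tbr', 'format_id'))
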
1393 def _check_formats(self, formats, video_id):
1394 if formats:
1395 formats[:] = filter(
1396 lambda f: self._is_valid_url(
1397 f['url'], video_id,
1398 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1399 formats)
1400
1401 @staticmethod
1402 def _remove_duplicate_formats(formats):
1403 format_urls = set()
1404 unique_formats = []
1405 for f in formats:
1406 if f['url'] not in format_urls:
1407 format_urls.add(f['url'])
1408 unique_formats.append(f)
1409 formats[:] = unique_formats
1410
1411 def _is_valid_url(self, url, video_id, item='video', headers={}):
1412 url = self._proto_relative_url(url, scheme='http:')
1413 # For now assume non HTTP(S) URLs always valid
1414 if not (url.startswith('http://') or url.startswith('https://')):
1415 return True
1416 try:
1417 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1418 return True
1419 except ExtractorError as e:
1420 if isinstance(e.cause, compat_urllib_error.URLError):
1421 self.to_screen(
1422 '%s: %s URL is invalid, skipping' % (video_id, item))
1423 return False
1424 raise
1425
1426 def http_scheme(self):
1427 """ Either "http:" or "https:", depending on the user's preferences """
1428 return (
1429 'http:'
1430 if self._downloader.params.get('prefer_insecure', False)
1431 else 'https:')
1432
1433 def _proto_relative_url(self, url, scheme=None):
1434 if url is None:
1435 return url
1436 if url.startswith('//'):
1437 if scheme is None:
1438 scheme = self.http_scheme()
1439 return scheme + url
1440 else:
1441 return url
1442
1443 def _sleep(self, timeout, video_id, msg_template=None):
1444 if msg_template is None:
1445 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1446 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1447 self.to_screen(msg)
1448 time.sleep(timeout)
1449
1450 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1451 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1452 fatal=True, m3u8_id=None):
1453 manifest = self._download_xml(
1454 manifest_url, video_id, 'Downloading f4m manifest',
1455 'Unable to download f4m manifest',
1456 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1457 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1458 transform_source=transform_source,
1459 fatal=fatal)
1460
1461 if manifest is False:
1462 return []
1463
1464 return self._parse_f4m_formats(
1465 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1466 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1467
1468 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1469 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1470 fatal=True, m3u8_id=None):
1471 if not isinstance(manifest, compat_etree_Element) and not fatal:
1472 return []
1473
1474 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1475 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1476 if akamai_pv is not None and ';' in akamai_pv.text:
1477 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1478 if playerVerificationChallenge.strip() != '':
1479 return []
1480
1481 formats = []
1482 manifest_version = '1.0'
1483 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1484 if not media_nodes:
1485 manifest_version = '2.0'
1486 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1487 # Remove unsupported DRM protected media from final formats
1488 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1489 media_nodes = remove_encrypted_media(media_nodes)
1490 if not media_nodes:
1491 return formats
1492
1493 manifest_base_url = get_base_url(manifest)
1494
1495 bootstrap_info = xpath_element(
1496 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1497 'bootstrap info', default=None)
1498
1499 vcodec = None
1500 mime_type = xpath_text(
1501 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
'mime type', default=None)
1503 if mime_type and mime_type.startswith('audio/'):
1504 vcodec = 'none'
1505
1506 for i, media_el in enumerate(media_nodes):
1507 tbr = int_or_none(media_el.attrib.get('bitrate'))
1508 width = int_or_none(media_el.attrib.get('width'))
1509 height = int_or_none(media_el.attrib.get('height'))
1510 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1511 # If <bootstrapInfo> is present, the specified f4m is a
1512 # stream-level manifest, and only set-level manifests may refer to
1513 # external resources. See section 11.4 and section 4 of F4M spec
1514 if bootstrap_info is None:
1515 media_url = None
1516 # @href is introduced in 2.0, see section 11.6 of F4M spec
1517 if manifest_version == '2.0':
1518 media_url = media_el.attrib.get('href')
1519 if media_url is None:
1520 media_url = media_el.attrib.get('url')
1521 if not media_url:
1522 continue
1523 manifest_url = (
1524 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1525 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself an f4m manifest, extract it recursively,
# since bitrates in the parent (i.e. this) manifest and in the
# media_url manifest may differ, which would prevent the f4m
# downloader from resolving a format by the requested bitrate
1530 ext = determine_ext(manifest_url)
1531 if ext == 'f4m':
1532 f4m_formats = self._extract_f4m_formats(
1533 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1534 transform_source=transform_source, fatal=fatal)
# Sometimes a stream-level manifest contains a single media entry
# without any quality metadata (e.g. http://matchtv.ru/#live-player)
# while the parent's media entry in the set-level manifest may
# contain it. Copy the metadata from the parent in such cases.
1539 if len(f4m_formats) == 1:
1540 f = f4m_formats[0]
1541 f.update({
1542 'tbr': f.get('tbr') or tbr,
1543 'width': f.get('width') or width,
1544 'height': f.get('height') or height,
1545 'format_id': f.get('format_id') if not tbr else format_id,
1546 'vcodec': vcodec,
1547 })
1548 formats.extend(f4m_formats)
1549 continue
1550 elif ext == 'm3u8':
1551 formats.extend(self._extract_m3u8_formats(
1552 manifest_url, video_id, 'mp4', preference=preference,
1553 m3u8_id=m3u8_id, fatal=fatal))
1554 continue
1555 formats.append({
1556 'format_id': format_id,
1557 'url': manifest_url,
1558 'manifest_url': manifest_url,
1559 'ext': 'flv' if bootstrap_info is not None else None,
1560 'protocol': 'f4m',
1561 'tbr': tbr,
1562 'width': width,
1563 'height': height,
1564 'vcodec': vcodec,
1565 'preference': preference,
1566 })
1567 return formats
1568
1569 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1570 return {
1571 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1572 'url': m3u8_url,
1573 'ext': ext,
1574 'protocol': 'm3u8',
1575 'preference': preference - 100 if preference else -100,
1576 'resolution': 'multiple',
1577 'format_note': 'Quality selection URL',
1578 }
1579
1580 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1581 entry_protocol='m3u8', preference=None,
1582 m3u8_id=None, note=None, errnote=None,
1583 fatal=True, live=False):
1584 res = self._download_webpage_handle(
1585 m3u8_url, video_id,
1586 note=note or 'Downloading m3u8 information',
1587 errnote=errnote or 'Failed to download m3u8 information',
1588 fatal=fatal)
1589
1590 if res is False:
1591 return []
1592
1593 m3u8_doc, urlh = res
1594 m3u8_url = urlh.geturl()
1595
1596 return self._parse_m3u8_formats(
1597 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1598 preference=preference, m3u8_id=m3u8_id, live=live)
1599
1600 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1601 entry_protocol='m3u8', preference=None,
1602 m3u8_id=None, live=False):
1603 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1604 return []
1605
1606 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1607 return []
1608
1609 formats = []
1610
1611 format_url = lambda u: (
1612 u
1613 if re.match(r'^https?://', u)
1614 else compat_urlparse.urljoin(m3u8_url, u))
1615
1616 # References:
1617 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1618 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1619 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1620
# We should try extracting formats only from master playlists [1, 4.3.4],
# i.e. playlists that describe available qualities. On the other hand
# media playlists [1, 4.3.3] should be returned as is since they contain
# just the media without quality renditions.
# Fortunately, a master playlist can be easily distinguished from a media
# playlist based on the availability of particular tags. As per [1, 4.3.3,
# 4.3.4] master playlist tags MUST NOT appear in a media playlist and vice
# versa. As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for
# every media playlist and MUST NOT appear in a master playlist, so we can
# reliably detect a media playlist with this criterion.
1631
1632 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1633 return [{
1634 'url': m3u8_url,
1635 'format_id': m3u8_id,
1636 'ext': ext,
1637 'protocol': entry_protocol,
1638 'preference': preference,
1639 }]
1640
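# For illustration, a sketch of the two playlist kinds this method
# distinguishes (hypothetical manifests):
#
#   Media playlist (returned as a single format above):
#     #EXTM3U
#     #EXT-X-TARGETDURATION:10
#     #EXTINF:9.009,
#     segment0.ts
#
#   Master playlist (parsed below into one format per variant):
#     #EXTM3U
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#     low/index.m3u8
#     #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720
#     hi/index.m3u8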
1641 groups = {}
1642 last_stream_inf = {}
1643
1644 def extract_media(x_media_line):
1645 media = parse_m3u8_attributes(x_media_line)
1646 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1647 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1648 if not (media_type and group_id and name):
1649 return
1650 groups.setdefault(group_id, []).append(media)
1651 if media_type not in ('VIDEO', 'AUDIO'):
1652 return
1653 media_url = media.get('URI')
1654 if media_url:
1655 format_id = []
1656 for v in (m3u8_id, group_id, name):
1657 if v:
1658 format_id.append(v)
1659 f = {
1660 'format_id': '-'.join(format_id),
1661 'url': format_url(media_url),
1662 'manifest_url': m3u8_url,
1663 'language': media.get('LANGUAGE'),
1664 'ext': ext,
1665 'protocol': entry_protocol,
1666 'preference': preference,
1667 }
1668 if media_type == 'AUDIO':
1669 f['vcodec'] = 'none'
1670 formats.append(f)
1671
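# A sketch of the EXT-X-MEDIA handling above (hypothetical tag;
# attribute parsing is done by parse_m3u8_attributes):
#
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="a.m3u8"
#
# yields a format roughly like
#
#   {'format_id': '<m3u8_id>-aud-English', 'url': '<base>/a.m3u8',
#    'language': 'en', 'vcodec': 'none', ...}
#
# and registers the rendition under groups['aud'] for later lookup.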
1672 def build_stream_name():
# Although the specification does not mention the NAME attribute
# for the EXT-X-STREAM-INF tag, it may still sometimes be present
# (see [1] or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
# 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1677 stream_name = last_stream_inf.get('NAME')
1678 if stream_name:
1679 return stream_name
# If there is no NAME in EXT-X-STREAM-INF it will be obtained
# from the corresponding rendition group
1682 stream_group_id = last_stream_inf.get('VIDEO')
1683 if not stream_group_id:
1684 return
1685 stream_group = groups.get(stream_group_id)
1686 if not stream_group:
1687 return stream_group_id
1688 rendition = stream_group[0]
1689 return rendition.get('NAME') or stream_group_id
1690
# Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to be able
# to detect video-only formats when EXT-X-STREAM-INF tags precede
# EXT-X-MEDIA tags in the HLS manifest, as in [3].
1694 for line in m3u8_doc.splitlines():
1695 if line.startswith('#EXT-X-MEDIA:'):
1696 extract_media(line)
1697
1698 for line in m3u8_doc.splitlines():
1699 if line.startswith('#EXT-X-STREAM-INF:'):
1700 last_stream_inf = parse_m3u8_attributes(line)
1701 elif line.startswith('#') or not line.strip():
1702 continue
1703 else:
1704 tbr = float_or_none(
1705 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1706 last_stream_inf.get('BANDWIDTH'), scale=1000)
1707 format_id = []
1708 if m3u8_id:
1709 format_id.append(m3u8_id)
1710 stream_name = build_stream_name()
# The bandwidth of live streams may differ over time, making
# format_id unpredictable, so it's better to keep the provided
# format_id intact.
1714 if not live:
1715 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1716 manifest_url = format_url(line.strip())
1717 f = {
1718 'format_id': '-'.join(format_id),
1719 'url': manifest_url,
1720 'manifest_url': m3u8_url,
1721 'tbr': tbr,
1722 'ext': ext,
1723 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1724 'protocol': entry_protocol,
1725 'preference': preference,
1726 }
1727 resolution = last_stream_inf.get('RESOLUTION')
1728 if resolution:
1729 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1730 if mobj:
1731 f['width'] = int(mobj.group('width'))
1732 f['height'] = int(mobj.group('height'))
1733 # Unified Streaming Platform
1734 mobj = re.search(
1735 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1736 if mobj:
1737 abr, vbr = mobj.groups()
1738 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1739 f.update({
1740 'vbr': vbr,
1741 'abr': abr,
1742 })
1743 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1744 f.update(codecs)
1745 audio_group_id = last_stream_inf.get('AUDIO')
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected: for example, [2]
# contains an EXT-X-STREAM-INF tag which references an AUDIO
# rendition group but has no CODECS, and despite referencing
# an audio group it represents a complete (audio and video)
# format. For such cases we ignore references to rendition
# groups and treat them as complete formats.
1755 if audio_group_id and codecs and f.get('vcodec') != 'none':
1756 audio_group = groups.get(audio_group_id)
1757 if audio_group and audio_group[0].get('URI'):
1758 # TODO: update acodec for audio only formats with
1759 # the same GROUP-ID
1760 f['acodec'] = 'none'
1761 formats.append(f)
1762 last_stream_inf = {}
1763 return formats
1764
1765 @staticmethod
1766 def _xpath_ns(path, namespace=None):
1767 if not namespace:
1768 return path
1769 out = []
1770 for c in path.split('/'):
1771 if not c or c == '.':
1772 out.append(c)
1773 else:
1774 out.append('{%s}%s' % (namespace, c))
1775 return '/'.join(out)
1776
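# A sketch of _xpath_ns (hypothetical namespace URI):
#
#   InfoExtractor._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
#   -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'
#
# Leading '.' and empty path components are kept as-is.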
1777 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1778 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1779
1780 if smil is False:
1781 assert not fatal
1782 return []
1783
1784 namespace = self._parse_smil_namespace(smil)
1785
1786 return self._parse_smil_formats(
1787 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1788
1789 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1790 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1791 if smil is False:
1792 return {}
1793 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1794
1795 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1796 return self._download_xml(
1797 smil_url, video_id, 'Downloading SMIL file',
1798 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1799
1800 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1801 namespace = self._parse_smil_namespace(smil)
1802
1803 formats = self._parse_smil_formats(
1804 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1805 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1806
1807 video_id = os.path.splitext(url_basename(smil_url))[0]
1808 title = None
1809 description = None
1810 upload_date = None
1811 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1812 name = meta.attrib.get('name')
1813 content = meta.attrib.get('content')
1814 if not name or not content:
1815 continue
1816 if not title and name == 'title':
1817 title = content
1818 elif not description and name in ('description', 'abstract'):
1819 description = content
1820 elif not upload_date and name == 'date':
1821 upload_date = unified_strdate(content)
1822
1823 thumbnails = [{
1824 'id': image.get('type'),
1825 'url': image.get('src'),
1826 'width': int_or_none(image.get('width')),
1827 'height': int_or_none(image.get('height')),
1828 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1829
1830 return {
1831 'id': video_id,
1832 'title': title or video_id,
1833 'description': description,
1834 'upload_date': upload_date,
1835 'thumbnails': thumbnails,
1836 'formats': formats,
1837 'subtitles': subtitles,
1838 }
1839
1840 def _parse_smil_namespace(self, smil):
1841 return self._search_regex(
1842 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1843
1844 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1845 base = smil_url
1846 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1847 b = meta.get('base') or meta.get('httpBase')
1848 if b:
1849 base = b
1850 break
1851
1852 formats = []
1853 rtmp_count = 0
1854 http_count = 0
1855 m3u8_count = 0
1856
1857 srcs = []
1858 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1859 for medium in media:
1860 src = medium.get('src')
1861 if not src or src in srcs:
1862 continue
1863 srcs.append(src)
1864
1865 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1866 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1867 width = int_or_none(medium.get('width'))
1868 height = int_or_none(medium.get('height'))
1869 proto = medium.get('proto')
1870 ext = medium.get('ext')
1871 src_ext = determine_ext(src)
1872 streamer = medium.get('streamer') or base
1873
1874 if proto == 'rtmp' or streamer.startswith('rtmp'):
1875 rtmp_count += 1
1876 formats.append({
1877 'url': streamer,
1878 'play_path': src,
1879 'ext': 'flv',
1880 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1881 'tbr': bitrate,
1882 'filesize': filesize,
1883 'width': width,
1884 'height': height,
1885 })
1886 if transform_rtmp_url:
1887 streamer, src = transform_rtmp_url(streamer, src)
1888 formats[-1].update({
1889 'url': streamer,
1890 'play_path': src,
1891 })
1892 continue
1893
1894 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1895 src_url = src_url.strip()
1896
1897 if proto == 'm3u8' or src_ext == 'm3u8':
1898 m3u8_formats = self._extract_m3u8_formats(
1899 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1900 if len(m3u8_formats) == 1:
1901 m3u8_count += 1
1902 m3u8_formats[0].update({
1903 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1904 'tbr': bitrate,
1905 'width': width,
1906 'height': height,
1907 })
1908 formats.extend(m3u8_formats)
1909 elif src_ext == 'f4m':
1910 f4m_url = src_url
1911 if not f4m_params:
1912 f4m_params = {
1913 'hdcore': '3.2.0',
1914 'plugin': 'flowplayer-3.2.0.1',
1915 }
1916 f4m_url += '&' if '?' in f4m_url else '?'
1917 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1918 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1919 elif src_ext == 'mpd':
1920 formats.extend(self._extract_mpd_formats(
1921 src_url, video_id, mpd_id='dash', fatal=False))
1922 elif re.search(r'\.ism/[Mm]anifest', src_url):
1923 formats.extend(self._extract_ism_formats(
1924 src_url, video_id, ism_id='mss', fatal=False))
1925 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1926 http_count += 1
1927 formats.append({
1928 'url': src_url,
1929 'ext': ext or src_ext or 'flv',
1930 'format_id': 'http-%d' % (bitrate or http_count),
1931 'tbr': bitrate,
1932 'filesize': filesize,
1933 'width': width,
1934 'height': height,
1935 })
1936
1937 return formats
1938
1939 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1940 urls = []
1941 subtitles = {}
1942 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1943 src = textstream.get('src')
1944 if not src or src in urls:
1945 continue
1946 urls.append(src)
1947 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1948 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1949 subtitles.setdefault(lang, []).append({
1950 'url': src,
1951 'ext': ext,
1952 })
1953 return subtitles
1954
1955 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1956 xspf = self._download_xml(
xspf_url, playlist_id, 'Downloading xspf playlist',
1958 'Unable to download xspf manifest', fatal=fatal)
1959 if xspf is False:
1960 return []
1961 return self._parse_xspf(
1962 xspf, playlist_id, xspf_url=xspf_url,
1963 xspf_base_url=base_url(xspf_url))
1964
1965 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1966 NS_MAP = {
1967 'xspf': 'http://xspf.org/ns/0/',
1968 's1': 'http://static.streamone.nl/player/ns/0',
1969 }
1970
1971 entries = []
1972 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1973 title = xpath_text(
1974 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1975 description = xpath_text(
1976 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1977 thumbnail = xpath_text(
1978 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1979 duration = float_or_none(
1980 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1981
1982 formats = []
1983 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1984 format_url = urljoin(xspf_base_url, location.text)
1985 if not format_url:
1986 continue
1987 formats.append({
1988 'url': format_url,
1989 'manifest_url': xspf_url,
1990 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1991 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1992 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1993 })
1994 self._sort_formats(formats)
1995
1996 entries.append({
1997 'id': playlist_id,
1998 'title': title,
1999 'description': description,
2000 'thumbnail': thumbnail,
2001 'duration': duration,
2002 'formats': formats,
2003 })
2004 return entries
2005
2006 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
2007 res = self._download_xml_handle(
2008 mpd_url, video_id,
2009 note=note or 'Downloading MPD manifest',
2010 errnote=errnote or 'Failed to download MPD manifest',
2011 fatal=fatal)
2012 if res is False:
2013 return []
2014 mpd_doc, urlh = res
2015 mpd_base_url = base_url(urlh.geturl())
2016
2017 return self._parse_mpd_formats(
2018 mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2019 formats_dict=formats_dict, mpd_url=mpd_url)
2020
2021 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2022 """
2023 Parse formats from MPD manifest.
2024 References:
2025 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2026 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2027 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2028 """
2029 if mpd_doc.get('type') == 'dynamic':
2030 return []
2031
2032 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2033
2034 def _add_ns(path):
2035 return self._xpath_ns(path, namespace)
2036
2037 def is_drm_protected(element):
2038 return element.find(_add_ns('ContentProtection')) is not None
2039
2040 def extract_multisegment_info(element, ms_parent_info):
2041 ms_info = ms_parent_info.copy()
2042
# As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
# common attributes and elements; we extract only those that are
# relevant for us.
2046 def extract_common(source):
2047 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2048 if segment_timeline is not None:
2049 s_e = segment_timeline.findall(_add_ns('S'))
2050 if s_e:
2051 ms_info['total_number'] = 0
2052 ms_info['s'] = []
2053 for s in s_e:
2054 r = int(s.get('r', 0))
2055 ms_info['total_number'] += 1 + r
2056 ms_info['s'].append({
2057 't': int(s.get('t', 0)),
2058 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2059 'd': int(s.attrib['d']),
2060 'r': r,
2061 })
2062 start_number = source.get('startNumber')
2063 if start_number:
2064 ms_info['start_number'] = int(start_number)
2065 timescale = source.get('timescale')
2066 if timescale:
2067 ms_info['timescale'] = int(timescale)
2068 segment_duration = source.get('duration')
2069 if segment_duration:
2070 ms_info['segment_duration'] = float(segment_duration)
2071
2072 def extract_Initialization(source):
2073 initialization = source.find(_add_ns('Initialization'))
2074 if initialization is not None:
2075 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2076
2077 segment_list = element.find(_add_ns('SegmentList'))
2078 if segment_list is not None:
2079 extract_common(segment_list)
2080 extract_Initialization(segment_list)
2081 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2082 if segment_urls_e:
2083 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2084 else:
2085 segment_template = element.find(_add_ns('SegmentTemplate'))
2086 if segment_template is not None:
2087 extract_common(segment_template)
2088 media = segment_template.get('media')
2089 if media:
2090 ms_info['media'] = media
2091 initialization = segment_template.get('initialization')
2092 if initialization:
2093 ms_info['initialization'] = initialization
2094 else:
2095 extract_Initialization(segment_template)
2096 return ms_info
2097
2098 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2099 formats = []
2100 for period in mpd_doc.findall(_add_ns('Period')):
2101 period_duration = parse_duration(period.get('duration')) or mpd_duration
2102 period_ms_info = extract_multisegment_info(period, {
2103 'start_number': 1,
2104 'timescale': 1,
2105 })
2106 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2107 if is_drm_protected(adaptation_set):
2108 continue
2109 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2110 for representation in adaptation_set.findall(_add_ns('Representation')):
2111 if is_drm_protected(representation):
2112 continue
2113 representation_attrib = adaptation_set.attrib.copy()
2114 representation_attrib.update(representation.attrib)
2115 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2116 mime_type = representation_attrib['mimeType']
2117 content_type = mime_type.split('/')[0]
2118 if content_type == 'text':
2119 # TODO implement WebVTT downloading
2120 pass
2121 elif content_type in ('video', 'audio'):
2122 base_url = ''
2123 for element in (representation, adaptation_set, period, mpd_doc):
2124 base_url_e = element.find(_add_ns('BaseURL'))
2125 if base_url_e is not None:
2126 base_url = base_url_e.text + base_url
2127 if re.match(r'^https?://', base_url):
2128 break
2129 if mpd_base_url and not re.match(r'^https?://', base_url):
2130 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2131 mpd_base_url += '/'
2132 base_url = mpd_base_url + base_url
2133 representation_id = representation_attrib.get('id')
2134 lang = representation_attrib.get('lang')
2135 url_el = representation.find(_add_ns('BaseURL'))
2136 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2137 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2138 f = {
2139 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2140 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2141 'url': mpd_url or base_url,
2142 'manifest_url': mpd_url,
2143 'ext': mimetype2ext(mime_type),
2144 'width': int_or_none(representation_attrib.get('width')),
2145 'height': int_or_none(representation_attrib.get('height')),
2146 'tbr': float_or_none(bandwidth, 1000),
2147 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2148 'fps': int_or_none(representation_attrib.get('frameRate')),
2149 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2150 'format_note': 'DASH %s' % content_type,
2151 'filesize': filesize,
2152 'container': mimetype2ext(mime_type) + '_dash',
2153 }
2154 f.update(parse_codecs(representation_attrib.get('codecs')))
2155 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2156
2157 def prepare_template(template_name, identifiers):
2158 tmpl = representation_ms_info[template_name]
# First of all, % characters outside $...$ templates must be
# escaped by doubling for proper processing by the % string
# formatting operator used further on (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
2163 t = ''
2164 in_template = False
2165 for c in tmpl:
2166 t += c
2167 if c == '$':
2168 in_template = not in_template
2169 elif c == '%' and not in_template:
2170 t += c
2171 # Next, $...$ templates are translated to their
2172 # %(...) counterparts to be used with % operator
2173 t = t.replace('$RepresentationID$', representation_id)
2174 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2175 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
# As per [1, 5.3.9.4.4, Table 16] $$ is an escape sequence for $
t = t.replace('$$', '$')
2177 return t
2178
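# A sketch of prepare_template on a typical @media template
# (hypothetical values; $RepresentationID$ is substituted directly,
# other identifiers become %-style placeholders):
#
#   prepare_template('media', ('Number', 'Bandwidth', 'Time'))
#   with media = 'seg-$RepresentationID$-$Number%05d$.m4s'
#   and representation_id = 'video_1'
#   -> 'seg-video_1-%(Number)05d.m4s'
#   then 'seg-video_1-%(Number)05d.m4s' % {'Number': 7}
#   -> 'seg-video_1-00007.m4s'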
2179 # @initialization is a regular template like @media one
2180 # so it should be handled just the same way (see
2181 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2182 if 'initialization' in representation_ms_info:
2183 initialization_template = prepare_template(
2184 'initialization',
2185 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2186 # $Time$ shall not be included for @initialization thus
2187 # only $Bandwidth$ remains
2188 ('Bandwidth', ))
2189 representation_ms_info['initialization_url'] = initialization_template % {
2190 'Bandwidth': bandwidth,
2191 }
2192
2193 def location_key(location):
2194 return 'url' if re.match(r'^https?://', location) else 'path'
2195
2196 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2197
2198 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2199 media_location_key = location_key(media_template)
2200
2201 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2202 # can't be used at the same time
2203 if '%(Number' in media_template and 's' not in representation_ms_info:
2204 segment_duration = None
2205 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2206 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2207 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2208 representation_ms_info['fragments'] = [{
2209 media_location_key: media_template % {
2210 'Number': segment_number,
2211 'Bandwidth': bandwidth,
2212 },
2213 'duration': segment_duration,
2214 } for segment_number in range(
2215 representation_ms_info['start_number'],
2216 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2217 else:
2218 # $Number*$ or $Time$ in media template with S list available
2219 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2220 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2221 representation_ms_info['fragments'] = []
2222 segment_time = 0
2223 segment_d = None
2224 segment_number = representation_ms_info['start_number']
2225
2226 def add_segment_url():
2227 segment_url = media_template % {
2228 'Time': segment_time,
2229 'Bandwidth': bandwidth,
2230 'Number': segment_number,
2231 }
2232 representation_ms_info['fragments'].append({
2233 media_location_key: segment_url,
2234 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2235 })
2236
2237 for num, s in enumerate(representation_ms_info['s']):
2238 segment_time = s.get('t') or segment_time
2239 segment_d = s['d']
2240 add_segment_url()
2241 segment_number += 1
2242 for r in range(s.get('r', 0)):
2243 segment_time += segment_d
2244 add_segment_url()
2245 segment_number += 1
2246 segment_time += segment_d
2247 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2248 # No media template
2249 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2250 # or any YouTube dashsegments video
2251 fragments = []
2252 segment_index = 0
2253 timescale = representation_ms_info['timescale']
2254 for s in representation_ms_info['s']:
2255 duration = float_or_none(s['d'], timescale)
2256 for r in range(s.get('r', 0) + 1):
2257 segment_uri = representation_ms_info['segment_urls'][segment_index]
2258 fragments.append({
2259 location_key(segment_uri): segment_uri,
2260 'duration': duration,
2261 })
2262 segment_index += 1
2263 representation_ms_info['fragments'] = fragments
2264 elif 'segment_urls' in representation_ms_info:
2265 # Segment URLs with no SegmentTimeline
2266 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2267 # https://github.com/ytdl-org/youtube-dl/pull/14844
2268 fragments = []
2269 segment_duration = float_or_none(
2270 representation_ms_info['segment_duration'],
2271 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2272 for segment_url in representation_ms_info['segment_urls']:
2273 fragment = {
2274 location_key(segment_url): segment_url,
2275 }
2276 if segment_duration:
2277 fragment['duration'] = segment_duration
2278 fragments.append(fragment)
2279 representation_ms_info['fragments'] = fragments
2280 # NB: MPD manifest may contain direct URLs to unfragmented media.
2281 # No fragments key is present in this case.
2282 if 'fragments' in representation_ms_info:
2283 f.update({
2284 'fragment_base_url': base_url,
2285 'fragments': [],
2286 'protocol': 'http_dash_segments',
2287 })
2288 if 'initialization_url' in representation_ms_info:
2289 initialization_url = representation_ms_info['initialization_url']
2290 if not f.get('url'):
2291 f['url'] = initialization_url
2292 f['fragments'].append({location_key(initialization_url): initialization_url})
2293 f['fragments'].extend(representation_ms_info['fragments'])
2294 # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2295 # is not necessarily unique within a Period thus formats with
2296 # the same `format_id` are quite possible. There are numerous examples
2297 # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
2298 # https://github.com/ytdl-org/youtube-dl/issues/13919)
2299 full_info = formats_dict.get(representation_id, {}).copy()
2300 full_info.update(f)
2301 formats.append(full_info)
2302 else:
2303 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2304 return formats
2305
2306 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2307 res = self._download_xml_handle(
2308 ism_url, video_id,
2309 note=note or 'Downloading ISM manifest',
2310 errnote=errnote or 'Failed to download ISM manifest',
2311 fatal=fatal)
2312 if res is False:
2313 return []
2314 ism_doc, urlh = res
2315
2316 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2317
2318 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2319 """
2320 Parse formats from ISM manifest.
2321 References:
2322 1. [MS-SSTR]: Smooth Streaming Protocol,
2323 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2324 """
2325 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2326 return []
2327
2328 duration = int(ism_doc.attrib['Duration'])
2329 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2330
2331 formats = []
2332 for stream in ism_doc.findall('StreamIndex'):
2333 stream_type = stream.get('Type')
2334 if stream_type not in ('video', 'audio'):
2335 continue
2336 url_pattern = stream.attrib['Url']
2337 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2338 stream_name = stream.get('Name')
2339 for track in stream.findall('QualityLevel'):
2340 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2341 # TODO: add support for WVC1 and WMAP
2342 if fourcc not in ('H264', 'AVC1', 'AACL'):
2343 self.report_warning('%s is not a supported codec' % fourcc)
2344 continue
2345 tbr = int(track.attrib['Bitrate']) // 1000
2346 # [1] does not mention Width and Height attributes. However,
2347 # they're often present while MaxWidth and MaxHeight are
2348 # missing, so should be used as fallbacks
2349 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2350 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2351 sampling_rate = int_or_none(track.get('SamplingRate'))
2352
2353 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2354 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2355
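# A sketch of the URL pattern substitution above (hypothetical
# Smooth Streaming values):
#
#   url_pattern = 'QualityLevels({bitrate})/Fragments(video={start time})'
#   with Bitrate '1500000' the track pattern becomes
#   'QualityLevels(1500000)/Fragments(video={start time})'
#   and each fragment URL built below fills in {start time} with
#   fragment_ctx['time'], e.g. '.../Fragments(video=60000000)'.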
2356 fragments = []
2357 fragment_ctx = {
2358 'time': 0,
2359 }
2360 stream_fragments = stream.findall('c')
2361 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2362 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2363 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2364 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2365 if not fragment_ctx['duration']:
2366 try:
next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2368 except IndexError:
2369 next_fragment_time = duration
2370 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2371 for _ in range(fragment_repeat):
2372 fragments.append({
2373 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2374 'duration': fragment_ctx['duration'] / stream_timescale,
2375 })
2376 fragment_ctx['time'] += fragment_ctx['duration']
2377
2378 format_id = []
2379 if ism_id:
2380 format_id.append(ism_id)
2381 if stream_name:
2382 format_id.append(stream_name)
2383 format_id.append(compat_str(tbr))
2384
2385 formats.append({
2386 'format_id': '-'.join(format_id),
2387 'url': ism_url,
2388 'manifest_url': ism_url,
2389 'ext': 'ismv' if stream_type == 'video' else 'isma',
2390 'width': width,
2391 'height': height,
2392 'tbr': tbr,
2393 'asr': sampling_rate,
2394 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2395 'acodec': 'none' if stream_type == 'video' else fourcc,
2396 'protocol': 'ism',
2397 'fragments': fragments,
2398 '_download_params': {
2399 'duration': duration,
2400 'timescale': stream_timescale,
2401 'width': width or 0,
2402 'height': height or 0,
2403 'fourcc': fourcc,
2404 'codec_private_data': track.get('CodecPrivateData'),
2405 'sampling_rate': sampling_rate,
2406 'channels': int_or_none(track.get('Channels', 2)),
2407 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2408 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2409 },
2410 })
2411 return formats
2412
2413 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2414 def absolute_url(item_url):
2415 return urljoin(base_url, item_url)
2416
2417 def parse_content_type(content_type):
2418 if not content_type:
2419 return {}
2420 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2421 if ctr:
2422 mimetype, codecs = ctr.groups()
2423 f = parse_codecs(codecs)
2424 f['ext'] = mimetype2ext(mimetype)
2425 return f
2426 return {}
2427
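# A sketch of parse_content_type (hypothetical type attribute; the
# actual splitting is delegated to parse_codecs and mimetype2ext):
#
#   parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
#   -> {'ext': 'mp4', 'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}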
2428 def _media_formats(src, cur_media_type, type_info={}):
2429 full_url = absolute_url(src)
2430 ext = type_info.get('ext') or determine_ext(full_url)
2431 if ext == 'm3u8':
2432 is_plain_url = False
2433 formats = self._extract_m3u8_formats(
2434 full_url, video_id, ext='mp4',
2435 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2436 preference=preference, fatal=False)
2437 elif ext == 'mpd':
2438 is_plain_url = False
2439 formats = self._extract_mpd_formats(
2440 full_url, video_id, mpd_id=mpd_id, fatal=False)
2441 else:
2442 is_plain_url = True
2443 formats = [{
2444 'url': full_url,
2445 'vcodec': 'none' if cur_media_type == 'audio' else None,
2446 }]
2447 return is_plain_url, formats
2448
2449 entries = []
# amp-video and amp-audio are very similar to their HTML5 counterparts
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
2453 media_tags = [(media_tag, media_type, '')
2454 for media_tag, media_type
2455 in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2456 media_tags.extend(re.findall(
# We only allow video|audio followed by whitespace or '>'.
# Allowing more characters may result in a significant slowdown (see
# https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
2461 r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2462 for media_tag, media_type, media_content in media_tags:
2463 media_info = {
2464 'formats': [],
2465 'subtitles': {},
2466 }
2467 media_attributes = extract_attributes(media_tag)
2468 src = media_attributes.get('src')
2469 if src:
2470 _, formats = _media_formats(src, media_type)
2471 media_info['formats'].extend(formats)
2472 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2473 if media_content:
2474 for source_tag in re.findall(r'<source[^>]+>', media_content):
2475 source_attributes = extract_attributes(source_tag)
2476 src = source_attributes.get('src')
2477 if not src:
2478 continue
2479 f = parse_content_type(source_attributes.get('type'))
2480 is_plain_url, formats = _media_formats(src, media_type, f)
2481 if is_plain_url:
2482 # res attribute is not standard but seen several times
2483 # in the wild
2484 f.update({
2485 'height': int_or_none(source_attributes.get('res')),
2486 'format_id': source_attributes.get('label'),
2487 })
2488 f.update(formats[0])
2489 media_info['formats'].append(f)
2490 else:
2491 media_info['formats'].extend(formats)
2492 for track_tag in re.findall(r'<track[^>]+>', media_content):
2493 track_attributes = extract_attributes(track_tag)
2494 kind = track_attributes.get('kind')
2495 if not kind or kind in ('subtitles', 'captions'):
2496 src = track_attributes.get('src')
2497 if not src:
2498 continue
2499 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2500 media_info['subtitles'].setdefault(lang, []).append({
2501 'url': absolute_url(src),
2502 })
2503 for f in media_info['formats']:
2504 f.setdefault('http_headers', {})['Referer'] = base_url
2505 if media_info['formats'] or media_info['subtitles']:
2506 entries.append(media_info)
2507 return entries
2508
2509 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2510 formats = []
2511 hdcore_sign = 'hdcore=3.7.0'
2512 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2513 hds_host = hosts.get('hds')
2514 if hds_host:
2515 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2516 if 'hdcore=' not in f4m_url:
2517 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2518 f4m_formats = self._extract_f4m_formats(
2519 f4m_url, video_id, f4m_id='hds', fatal=False)
2520 for entry in f4m_formats:
2521 entry.update({'extra_param_to_segment_url': hdcore_sign})
2522 formats.extend(f4m_formats)
2523 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2524 hls_host = hosts.get('hls')
2525 if hls_host:
2526 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2527 formats.extend(self._extract_m3u8_formats(
2528 m3u8_url, video_id, 'mp4', 'm3u8_native',
2529 m3u8_id='hls', fatal=False))
2530 return formats
2531
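# A sketch of the Akamai URL juggling above (hypothetical host/path):
#
#   manifest_url = 'http://ak.example.com/i/foo/master.m3u8'
#   f4m_url  -> 'http://ak.example.com/z/foo/manifest.f4m?hdcore=3.7.0'
#   m3u8_url -> 'http://ak.example.com/i/foo/master.m3u8'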
2532 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2533 query = compat_urlparse.urlparse(url).query
2534 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2535 mobj = re.search(
2536 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2537 url_base = mobj.group('url')
2538 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2539 formats = []
2540
2541 def manifest_url(manifest):
2542 m_url = '%s/%s' % (http_base_url, manifest)
2543 if query:
2544 m_url += '?%s' % query
2545 return m_url
2546
2547 if 'm3u8' not in skip_protocols:
2548 formats.extend(self._extract_m3u8_formats(
2549 manifest_url('playlist.m3u8'), video_id, 'mp4',
2550 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2551 if 'f4m' not in skip_protocols:
2552 formats.extend(self._extract_f4m_formats(
2553 manifest_url('manifest.f4m'),
2554 video_id, f4m_id='hds', fatal=False))
2555 if 'dash' not in skip_protocols:
2556 formats.extend(self._extract_mpd_formats(
2557 manifest_url('manifest.mpd'),
2558 video_id, mpd_id='dash', fatal=False))
2559 if re.search(r'(?:/smil:|\.smil)', url_base):
2560 if 'smil' not in skip_protocols:
2561 rtmp_formats = self._extract_smil_formats(
2562 manifest_url('jwplayer.smil'),
2563 video_id, fatal=False)
2564 for rtmp_format in rtmp_formats:
2565 rtsp_format = rtmp_format.copy()
2566 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2567 del rtsp_format['play_path']
2568 del rtsp_format['ext']
2569 rtsp_format.update({
2570 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2571 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2572 'protocol': 'rtsp',
2573 })
2574 formats.extend([rtmp_format, rtsp_format])
2575 else:
2576 for protocol in ('rtmp', 'rtsp'):
2577 if protocol not in skip_protocols:
2578 formats.append({
2579 'url': '%s:%s' % (protocol, url_base),
2580 'format_id': protocol,
2581 'protocol': protocol,
2582 })
2583 return formats
2584
2585 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2586 mobj = re.search(
2587 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2588 webpage)
2589 if mobj:
2590 try:
2591 jwplayer_data = self._parse_json(mobj.group('options'),
2592 video_id=video_id,
2593 transform_source=transform_source)
2594 except ExtractorError:
2595 pass
2596 else:
2597 if isinstance(jwplayer_data, dict):
2598 return jwplayer_data
2599
2600 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2601 jwplayer_data = self._find_jwplayer_data(
2602 webpage, video_id, transform_source=js_to_json)
2603 return self._parse_jwplayer_data(
2604 jwplayer_data, video_id, *args, **kwargs)
2605
2606 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2607 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2608 # JWPlayer backward compatibility: flattened playlists
2609 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2610 if 'playlist' not in jwplayer_data:
2611 jwplayer_data = {'playlist': [jwplayer_data]}
2612
2613 entries = []
2614
2615 # JWPlayer backward compatibility: single playlist item
2616 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2617 if not isinstance(jwplayer_data['playlist'], list):
2618 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2619
2620 for video_data in jwplayer_data['playlist']:
2621 # JWPlayer backward compatibility: flattened sources
2622 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2623 if 'sources' not in video_data:
2624 video_data['sources'] = [video_data]
2625
2626 this_video_id = video_id or video_data['mediaid']
2627
2628 formats = self._parse_jwplayer_formats(
2629 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2630 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2631
2632 subtitles = {}
2633 tracks = video_data.get('tracks')
2634 if tracks and isinstance(tracks, list):
2635 for track in tracks:
2636 if not isinstance(track, dict):
2637 continue
2638 track_kind = track.get('kind')
2639 if not track_kind or not isinstance(track_kind, compat_str):
2640 continue
2641 if track_kind.lower() not in ('captions', 'subtitles'):
2642 continue
2643 track_url = urljoin(base_url, track.get('file'))
2644 if not track_url:
2645 continue
2646 subtitles.setdefault(track.get('label') or 'en', []).append({
2647 'url': self._proto_relative_url(track_url)
2648 })
2649
2650 entry = {
2651 'id': this_video_id,
2652 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2653 'description': video_data.get('description'),
2654 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2655 'timestamp': int_or_none(video_data.get('pubdate')),
2656 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2657 'subtitles': subtitles,
2658 }
2659 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2660 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2661 entry.update({
2662 '_type': 'url_transparent',
2663 'url': formats[0]['url'],
2664 })
2665 else:
2666 self._sort_formats(formats)
2667 entry['formats'] = formats
2668 entries.append(entry)
2669 if len(entries) == 1:
2670 return entries[0]
2671 else:
2672 return self.playlist_result(entries)
2673
2674 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2675 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2676 urls = []
2677 formats = []
2678 for source in jwplayer_sources_data:
2679 if not isinstance(source, dict):
2680 continue
2681 source_url = urljoin(
2682 base_url, self._proto_relative_url(source.get('file')))
2683 if not source_url or source_url in urls:
2684 continue
2685 urls.append(source_url)
2686 source_type = source.get('type') or ''
2687 ext = mimetype2ext(source_type) or determine_ext(source_url)
2688 if source_type == 'hls' or ext == 'm3u8':
2689 formats.extend(self._extract_m3u8_formats(
2690 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2691 m3u8_id=m3u8_id, fatal=False))
2692 elif source_type == 'dash' or ext == 'mpd':
2693 formats.extend(self._extract_mpd_formats(
2694 source_url, video_id, mpd_id=mpd_id, fatal=False))
2695 elif ext == 'smil':
2696 formats.extend(self._extract_smil_formats(
2697 source_url, video_id, fatal=False))
2698 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2699 elif source_type.startswith('audio') or ext in (
2700 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2701 formats.append({
2702 'url': source_url,
2703 'vcodec': 'none',
2704 'ext': ext,
2705 })
2706 else:
2707 height = int_or_none(source.get('height'))
2708 if height is None:
# Often no height is provided but there is a label in a
# format like "1080p", "720p SD", or 1080.
2711 height = int_or_none(self._search_regex(
2712 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2713 'height', default=None))
2714 a_format = {
2715 'url': source_url,
2716 'width': int_or_none(source.get('width')),
2717 'height': height,
2718 'tbr': int_or_none(source.get('bitrate')),
2719 'ext': ext,
2720 }
2721 if source_url.startswith('rtmp'):
2722 a_format['ext'] = 'flv'
2723 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2724 # of jwplayer.flash.swf
2725 rtmp_url_parts = re.split(
2726 r'((?:mp4|mp3|flv):)', source_url, 1)
2727 if len(rtmp_url_parts) == 3:
2728 rtmp_url, prefix, play_path = rtmp_url_parts
2729 a_format.update({
2730 'url': rtmp_url,
2731 'play_path': prefix + play_path,
2732 })
2733 if rtmp_params:
2734 a_format.update(rtmp_params)
2735 formats.append(a_format)
2736 return formats
2737
2738 def _live_title(self, name):
2739 """ Generate the title for a live video """
2740 now = datetime.datetime.now()
2741 now_str = now.strftime('%Y-%m-%d %H:%M')
2742 return name + ' ' + now_str
2743
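# A sketch of _live_title (the timestamp is the current local time):
#
#   self._live_title('Some Channel')
#   -> 'Some Channel 2019-01-01 12:00' (for example)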
2744 def _int(self, v, name, fatal=False, **kwargs):
2745 res = int_or_none(v, **kwargs)
2748 if res is None:
2749 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2750 if fatal:
2751 raise ExtractorError(msg)
2752 else:
2753 self._downloader.report_warning(msg)
2754 return res
2755
2756 def _float(self, v, name, fatal=False, **kwargs):
2757 res = float_or_none(v, **kwargs)
2758 if res is None:
2759 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2760 if fatal:
2761 raise ExtractorError(msg)
2762 else:
2763 self._downloader.report_warning(msg)
2764 return res
2765
2766 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2767 path='/', secure=False, discard=False, rest={}, **kwargs):
2768 cookie = compat_cookiejar.Cookie(
2769 0, name, value, port, port is not None, domain, True,
2770 domain.startswith('.'), path, True, secure, expire_time,
2771 discard, None, None, rest)
2772 self._downloader.cookiejar.set_cookie(cookie)
2773
2774 def _get_cookies(self, url):
2775 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2776 req = sanitized_Request(url)
2777 self._downloader.cookiejar.add_cookie_header(req)
2778 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2779
2780 def get_testcases(self, include_onlymatching=False):
2781 t = getattr(self, '_TEST', None)
2782 if t:
2783 assert not hasattr(self, '_TESTS'), \
2784 '%s has _TEST and _TESTS' % type(self).__name__
2785 tests = [t]
2786 else:
2787 tests = getattr(self, '_TESTS', [])
2788 for t in tests:
2789 if not include_onlymatching and t.get('only_matching', False):
2790 continue
2791 t['name'] = type(self).__name__[:-len('IE')]
2792 yield t
2793
2794 def is_suitable(self, age_limit):
2795 """ Test whether the extractor is generally suitable for the given
2796 age limit (i.e. pornographic sites are not, all others usually are) """
2797
2798 any_restricted = False
2799 for tc in self.get_testcases(include_onlymatching=False):
2800 if tc.get('playlist', []):
2801 tc = tc['playlist'][0]
2802 is_restricted = age_restricted(
2803 tc.get('info_dict', {}).get('age_limit'), age_limit)
2804 if not is_restricted:
2805 return True
2806 any_restricted = any_restricted or is_restricted
2807 return not any_restricted
2808
2809 def extract_subtitles(self, *args, **kwargs):
2810 if (self._downloader.params.get('writesubtitles', False) or
2811 self._downloader.params.get('listsubtitles')):
2812 return self._get_subtitles(*args, **kwargs)
2813 return {}
2814
2815 def _get_subtitles(self, *args, **kwargs):
2816 raise NotImplementedError('This method must be implemented by subclasses')
2817
2818 @staticmethod
2819 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2820 """ Merge subtitle items for one language. Items with duplicated URLs
2821 will be dropped. """
2822 list1_urls = set([item['url'] for item in subtitle_list1])
2823 ret = list(subtitle_list1)
2824 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2825 return ret
2826
2827 @classmethod
2828 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2829 """ Merge two subtitle dictionaries, language by language. """
2830 ret = dict(subtitle_dict1)
2831 for lang in subtitle_dict2:
2832 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2833 return ret
2834
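# A sketch of subtitle merging (hypothetical entries; duplicate URLs
# within a language are dropped):
#
#   _merge_subtitles({'en': [{'url': 'a.vtt'}]},
#                    {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}],
#                     'de': [{'url': 'c.vtt'}]})
#   -> {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}],
#       'de': [{'url': 'c.vtt'}]}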
2835 def extract_automatic_captions(self, *args, **kwargs):
2836 if (self._downloader.params.get('writeautomaticsub', False) or
2837 self._downloader.params.get('listsubtitles')):
2838 return self._get_automatic_captions(*args, **kwargs)
2839 return {}
2840
2841 def _get_automatic_captions(self, *args, **kwargs):
2842 raise NotImplementedError('This method must be implemented by subclasses')
2843
2844 def mark_watched(self, *args, **kwargs):
2845 if (self._downloader.params.get('mark_watched', False) and
2846 (self._get_login_info()[0] is not None or
2847 self._downloader.params.get('cookiefile') is not None)):
2848 self._mark_watched(*args, **kwargs)
2849
2850 def _mark_watched(self, *args, **kwargs):
2851 raise NotImplementedError('This method must be implemented by subclasses')
2852
2853 def geo_verification_headers(self):
2854 headers = {}
2855 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2856 if geo_verification_proxy:
2857 headers['Ytdl-request-proxy'] = geo_verification_proxy
2858 return headers
2859
2860 def _generic_id(self, url):
2861 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2862
2863 def _generic_title(self, url):
2864 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2865
2866
2867 class SearchInfoExtractor(InfoExtractor):
2868 """
Base class for paged search query extractors.
2870 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2871 Instances should define _SEARCH_KEY and _MAX_RESULTS.
2872 """
2873
2874 @classmethod
2875 def _make_valid_url(cls):
2876 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2877
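# A sketch of the queries this matches (assuming a subclass with
# _SEARCH_KEY = 'examplesearch'):
#
#   'examplesearch:foo'    -> first result for "foo"
#   'examplesearch5:foo'   -> first 5 results
#   'examplesearchall:foo' -> up to _MAX_RESULTS results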
2878 @classmethod
2879 def suitable(cls, url):
2880 return re.match(cls._make_valid_url(), url) is not None
2881
2882 def _real_extract(self, query):
2883 mobj = re.match(self._make_valid_url(), query)
2884 if mobj is None:
2885 raise ExtractorError('Invalid search query "%s"' % query)
2886
2887 prefix = mobj.group('prefix')
2888 query = mobj.group('query')
2889 if prefix == '':
2890 return self._get_n_results(query, 1)
2891 elif prefix == 'all':
2892 return self._get_n_results(query, self._MAX_RESULTS)
2893 else:
2894 n = int(prefix)
2895 if n <= 0:
2896 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2897 elif n > self._MAX_RESULTS:
2898 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2899 n = self._MAX_RESULTS
2900 return self._get_n_results(query, n)
2901
2902 def _get_n_results(self, query, n):
2903 """Get a specified number of results for a query"""
2904 raise NotImplementedError('This method must be implemented by subclasses')
2905
2906 @property
2907 def SEARCH_KEY(self):
2908 return self._SEARCH_KEY