# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    mimetype2ext,
    orderedSet,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality. (An illustrative entry is
                    sketched after this field list.)

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media (DASH, hls, hds)
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

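                    For illustration only (the URL and all values below are
                    hypothetical, not taken from any real site), a minimal
                    formats entry might look like:

                        {
                            'url': 'https://cdn.example.com/video-720p.mp4',
                            'format_id': 'mp4-720p',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'tbr': 1800,
                            'vcodec': 'avc1.64001f',
                            'acodec': 'mp4a.40.2',
                        }
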
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

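    For illustration only (all values hypothetical), the smallest useful
    dictionary a _real_extract() implementation might return is:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://cdn.example.com/video.mp4',
            'ext': 'mp4',
        }
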
    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country. (experimental)

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled. (experimental)

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES. (experimental)

    NB: both these geo attributes are experimental and may change in future
    or be completely removed.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

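    # For illustration only: a hypothetical subclass would typically pair the
    # machinery above with a _VALID_URL that captures a named 'id' group, e.g.
    #
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    # so that cls._match_id('https://example.com/watch/42') returns '42'.
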
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from the provided country
        list is selected and a random IP belonging to this country is generated.
        This IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method is called for initial geo bypass initialization during
        instance initialization with _GEO_COUNTRIES and _GEO_IP_BLOCKS.

        You may also manually call it from an extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

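    # For illustration only: an extractor that only learns its geo context at
    # extraction time could call the method above with a hypothetical context
    # such as
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['DE', 'FR'],
    #         'ip_blocks': ['203.0.113.0/24'],  # documentation-range CIDR block
    #     })
    #
    # after which subsequent requests carry the faked X-Forwarded-For header.
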
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
                'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={}):
        """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None,
                      data=None, headers={}, query={}):
        """Return the xml as an xml.etree.ElementTree.Element"""
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={}):
        """Return a tuple (JSON object, URL handle)"""
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={}):
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query)
        return res if res is False else res[0]

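    # For illustration only: a hypothetical extractor would typically fetch an
    # API endpoint with the helper above, e.g.
    #
    #     data = self._download_json(
    #         'https://api.example.com/v1/videos/%s' % video_id, video_id,
    #         note='Downloading video metadata')
    #
    # and then read fields such as data.get('title') from the parsed object.
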
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

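    # For illustration only: a hypothetical playlist extractor could defer a
    # single entry to another extractor with
    #
    #     self.url_result('https://www.youtube.com/watch?v=BaW_jenozKc', ie='Youtube')
    #
    # which yields a {'_type': 'url', ...} dict to be resolved later.
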
    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

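    # For illustration only: with a list of already-built url_result() entries,
    # a hypothetical extractor would finish with something like
    #
    #     return self.playlist_result(entries, playlist_id, 'Channel uploads')
    #
    # producing the {'_type': 'playlist', 'entries': [...]} dict described in
    # the class docstring.
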
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

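    # For illustration only: a hypothetical extractor might pull a title out of
    # raw HTML with
    #
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
    #         default=None)
    #
    # where default=None turns a failed match into a soft miss instead of a
    # RegexNotFoundError.
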
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

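    # For illustration only: on a page carrying Open Graph metadata such as
    # <meta property="og:title" content="Some title">, a hypothetical extractor
    # could rely on the helpers above:
    #
    #     title = self._og_search_title(webpage)
    #     thumbnail = self._og_search_thumbnail(webpage, default=None)
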
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

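    # For illustration only: several candidate <meta> names can be tried at
    # once, e.g. a hypothetical
    #
    #     uploader = self._html_search_meta(
    #         ('author', 'article:author'), webpage, 'uploader', default=None)
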
    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        default = kwargs.get('default', NO_DEFAULT)
        if not json_ld:
            return default if default is not NO_DEFAULT else {}
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': e.get('contentUrl'),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type in ('TVEpisode', 'Episode'):
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    continue
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

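    # For illustration only: when re-submitting a login <form>, a hypothetical
    # extractor could seed its POST data from the page itself:
    #
    #     login_form = self._form_hidden_inputs('login-form', webpage)
    #     login_form.update({'username': username, 'password': password})
    #
    # (the 'login-form' id and field names are made up for the example).
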
    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)

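    # For illustration only: given two hypothetical formats
    #
    #     formats = [
    #         {'format_id': 'hd', 'url': 'https://example.com/hd.mp4', 'ext': 'mp4', 'height': 1080},
    #         {'format_id': 'sd', 'url': 'https://example.com/sd.mp4', 'ext': 'mp4', 'height': 360},
    #     ]
    #     self._sort_formats(formats)
    #
    # the list is reordered in place from worst to best, so the 1080p entry
    # ends up last, matching the ordering the class docstring expects.
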
96a53167
S
1267 def _check_formats(self, formats, video_id):
1268 if formats:
1269 formats[:] = filter(
1270 lambda f: self._is_valid_url(
1271 f['url'], video_id,
1272 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1273 formats)
1274
f5bdb444
S
1275 @staticmethod
1276 def _remove_duplicate_formats(formats):
1277 format_urls = set()
1278 unique_formats = []
1279 for f in formats:
1280 if f['url'] not in format_urls:
1281 format_urls.add(f['url'])
1282 unique_formats.append(f)
1283 formats[:] = unique_formats
1284
45024183 1285 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1286 url = self._proto_relative_url(url, scheme='http:')
1287 # For now assume non HTTP(S) URLs always valid
1288 if not (url.startswith('http://') or url.startswith('https://')):
1289 return True
96a53167 1290 try:
45024183 1291 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167
S
1292 return True
1293 except ExtractorError as e:
943a1e24 1294 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
1295 self.to_screen(
1296 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
1297 return False
1298 raise
1299
20991253 1300 def http_scheme(self):
1ede5b24 1301 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1302 return (
1303 'http:'
1304 if self._downloader.params.get('prefer_insecure', False)
1305 else 'https:')
1306
57c7411f
PH
1307 def _proto_relative_url(self, url, scheme=None):
1308 if url is None:
1309 return url
1310 if url.startswith('//'):
1311 if scheme is None:
1312 scheme = self.http_scheme()
1313 return scheme + url
1314 else:
1315 return url
1316
4094b6e3
PH
1317 def _sleep(self, timeout, video_id, msg_template=None):
1318 if msg_template is None:
f1a9d64e 1319 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1320 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1321 self.to_screen(msg)
1322 time.sleep(timeout)
1323
a38436e8 1324 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
4de61310 1325 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1326 fatal=True, m3u8_id=None):
f036a632
JMF
1327 manifest = self._download_xml(
1328 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1329 'Unable to download f4m manifest',
1330 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1331 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
4de61310
S
1332 transform_source=transform_source,
1333 fatal=fatal)
1334
1335 if manifest is False:
8d29e47f 1336 return []
31bb8d3f 1337
0fdbb332
S
1338 return self._parse_f4m_formats(
1339 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
448bb5f3 1340 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332
S
1341
1342 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1343 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1344 fatal=True, m3u8_id=None):
fb72ec58 1345 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1346 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1347 if akamai_pv is not None and ';' in akamai_pv.text:
1348 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1349 if playerVerificationChallenge.strip() != '':
1350 return []
1351
31bb8d3f 1352 formats = []
7a47d07c 1353 manifest_version = '1.0'
b2527359 1354 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1355 if not media_nodes:
7a47d07c 1356 manifest_version = '2.0'
34e48bed 1357 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762
S
1358 # Remove unsupported DRM protected media from final formats
1359 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1360 media_nodes = remove_encrypted_media(media_nodes)
1361 if not media_nodes:
1362 return formats
48107c19
S
1363
1364 manifest_base_url = get_base_url(manifest)
0a5685b2 1365
a6571f10 1366 bootstrap_info = xpath_element(
0a5685b2
YCH
1367 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1368 'bootstrap info', default=None)
1369
edd6074c
RA
1370 vcodec = None
1371 mime_type = xpath_text(
1372 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1373 'base URL', default=None)
1374 if mime_type and mime_type.startswith('audio/'):
1375 vcodec = 'none'
1376
b2527359 1377 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1378 tbr = int_or_none(media_el.attrib.get('bitrate'))
1379 width = int_or_none(media_el.attrib.get('width'))
1380 height = int_or_none(media_el.attrib.get('height'))
1381 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
448bb5f3
YCH
1382 # If <bootstrapInfo> is present, the specified f4m is a
1383 # stream-level manifest, and only set-level manifests may refer to
1384 # external resources. See section 11.4 and section 4 of F4M spec
1385 if bootstrap_info is None:
1386 media_url = None
1387 # @href is introduced in 2.0, see section 11.6 of F4M spec
1388 if manifest_version == '2.0':
1389 media_url = media_el.attrib.get('href')
1390 if media_url is None:
1391 media_url = media_el.attrib.get('url')
31c746e5
S
1392 if not media_url:
1393 continue
cc357c4d
S
1394 manifest_url = (
1395 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1396 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1397 # If media_url is itself an f4m manifest, do the recursive extraction,
1398 # since bitrates in the parent manifest (this one) and the media_url manifest
1399 # may differ, making it impossible to resolve the format by the requested
1400 # bitrate in the f4m downloader
240b6045
YCH
1401 ext = determine_ext(manifest_url)
1402 if ext == 'f4m':
77b8b4e6 1403 f4m_formats = self._extract_f4m_formats(
0fdbb332 1404 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
77b8b4e6
S
1405 transform_source=transform_source, fatal=fatal)
1406 # Sometimes a stream-level manifest contains a single media entry that
1407 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player).
1408 # At the same time the parent's media entry in the set-level manifest may
1409 # contain it. We copy it from the parent in such cases.
1410 if len(f4m_formats) == 1:
1411 f = f4m_formats[0]
1412 f.update({
1413 'tbr': f.get('tbr') or tbr,
1414 'width': f.get('width') or width,
1415 'height': f.get('height') or height,
1416 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1417 'vcodec': vcodec,
77b8b4e6
S
1418 })
1419 formats.extend(f4m_formats)
70f0f5a8 1420 continue
240b6045
YCH
1421 elif ext == 'm3u8':
1422 formats.extend(self._extract_m3u8_formats(
1423 manifest_url, video_id, 'mp4', preference=preference,
fac2af3c 1424 m3u8_id=m3u8_id, fatal=fatal))
240b6045 1425 continue
31bb8d3f 1426 formats.append({
77b8b4e6 1427 'format_id': format_id,
31bb8d3f 1428 'url': manifest_url,
30d0b549 1429 'manifest_url': manifest_url,
a6571f10 1430 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1431 'protocol': 'f4m',
b2527359 1432 'tbr': tbr,
77b8b4e6
S
1433 'width': width,
1434 'height': height,
edd6074c 1435 'vcodec': vcodec,
60ca389c 1436 'preference': preference,
31bb8d3f 1437 })
31bb8d3f
JMF
1438 return formats
1439
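The f4m format_id built above joins the f4m_id with the advertised bitrate, falling back to the media index when no bitrate is present. A small sketch of that rule (not the original helper); the values are illustrative.

def f4m_format_id(f4m_id, tbr, index):
    # '-'-join the non-empty parts, preferring the bitrate over the index
    return '-'.join(filter(None, [f4m_id, str(index if tbr is None else tbr)]))

assert f4m_format_id('hds', 1500, 0) == 'hds-1500'
assert f4m_format_id('hds', None, 2) == 'hds-2'
assert f4m_format_id(None, 1500, 0) == '1500'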
16da9bbc
YCH
1440 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1441 return {
f207019c 1442 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1443 'url': m3u8_url,
1444 'ext': ext,
1445 'protocol': 'm3u8',
37768f92 1446 'preference': preference - 100 if preference else -100,
704df56d
PH
1447 'resolution': 'multiple',
1448 'format_note': 'Quality selection URL',
16da9bbc
YCH
1449 }
1450
1451 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1452 entry_protocol='m3u8', preference=None,
1453 m3u8_id=None, note=None, errnote=None,
1454 fatal=True, live=False):
dbd82a1d 1455 res = self._download_webpage_handle(
81515ad9 1456 m3u8_url, video_id,
621ed9f5 1457 note=note or 'Downloading m3u8 information',
13af92fd
YCH
1458 errnote=errnote or 'Failed to download m3u8 information',
1459 fatal=fatal)
cb252080 1460
dbd82a1d 1461 if res is False:
8d29e47f 1462 return []
cb252080 1463
dbd82a1d 1464 m3u8_doc, urlh = res
37113045 1465 m3u8_url = urlh.geturl()
9cdffeeb 1466
cb252080
S
1467 return self._parse_m3u8_formats(
1468 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1469 preference=preference, m3u8_id=m3u8_id, live=live)
1470
1471 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1472 entry_protocol='m3u8', preference=None,
1473 m3u8_id=None, live=False):
08a00eef
RA
1474 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1475 return []
1476
ea229584
RA
1477 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1478 return []
1479
ff99fe52 1480 formats = []
0def7587
RA
1481
1482 format_url = lambda u: (
1483 u
1484 if re.match(r'^https?://', u)
1485 else compat_urlparse.urljoin(m3u8_url, u))
1486
cb252080
S
1487 # References:
1488 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1489 # 2. https://github.com/rg3/youtube-dl/issues/12211
1490
1491 # We should try extracting formats only from master playlists [1, 4.3.4],
1492 # i.e. playlists that describe available qualities. On the other hand
1493 # media playlists [1, 4.3.3] should be returned as is since they contain
1494 # just the media without quality renditions.
9cdffeeb 1495 # Fortunately, master playlist can be easily distinguished from media
cb252080
S
1496 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
1497 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1498 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
1499 # media playlist and MUST NOT appear in a master playlist, thus we can
1500 # clearly detect a media playlist with this criterion.
1501
9cdffeeb 1502 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
7f32e5dc 1503 return [{
1504 'url': m3u8_url,
1505 'format_id': m3u8_id,
1506 'ext': ext,
1507 'protocol': entry_protocol,
1508 'preference': preference,
1509 }]
cb252080
S
1510
1511 groups = {}
1512 last_stream_inf = {}
1513
1514 def extract_media(x_media_line):
1515 media = parse_m3u8_attributes(x_media_line)
1516 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1517 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1518 if not (media_type and group_id and name):
1519 return
1520 groups.setdefault(group_id, []).append(media)
1521 if media_type not in ('VIDEO', 'AUDIO'):
1522 return
1523 media_url = media.get('URI')
1524 if media_url:
1525 format_id = []
9211e331 1526 for v in (m3u8_id, group_id, name):
cb252080
S
1527 if v:
1528 format_id.append(v)
1529 f = {
1530 'format_id': '-'.join(format_id),
1531 'url': format_url(media_url),
c89b49f7 1532 'manifest_url': m3u8_url,
cb252080
S
1533 'language': media.get('LANGUAGE'),
1534 'ext': ext,
1535 'protocol': entry_protocol,
1536 'preference': preference,
1537 }
1538 if media_type == 'AUDIO':
1539 f['vcodec'] = 'none'
1540 formats.append(f)
1541
1542 def build_stream_name():
1543 # Although the specification does not mention the NAME attribute for
3019cb0c
S
1544 # the EXT-X-STREAM-INF tag, it may still be present (see [1]
1545 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 1546 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
1547 stream_name = last_stream_inf.get('NAME')
1548 if stream_name:
1549 return stream_name
1550 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1551 # from corresponding rendition group
1552 stream_group_id = last_stream_inf.get('VIDEO')
1553 if not stream_group_id:
1554 return
1555 stream_group = groups.get(stream_group_id)
1556 if not stream_group:
1557 return stream_group_id
1558 rendition = stream_group[0]
1559 return rendition.get('NAME') or stream_group_id
1560
704df56d
PH
1561 for line in m3u8_doc.splitlines():
1562 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 1563 last_stream_inf = parse_m3u8_attributes(line)
4cd95bcb 1564 elif line.startswith('#EXT-X-MEDIA:'):
cb252080 1565 extract_media(line)
704df56d
PH
1566 elif line.startswith('#') or not line.strip():
1567 continue
1568 else:
9c99bef7
S
1569 tbr = float_or_none(
1570 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1571 last_stream_inf.get('BANDWIDTH'), scale=1000)
8dc9d361
S
1572 format_id = []
1573 if m3u8_id:
1574 format_id.append(m3u8_id)
cb252080 1575 stream_name = build_stream_name()
b24d6336
KH
1576 # The bandwidth of live streams may differ over time, thus making
1577 # format_id unpredictable. So it's better to keep the provided
1578 # format_id intact.
e9c6cdf4 1579 if not live:
ed56f260 1580 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
30d0b549 1581 manifest_url = format_url(line.strip())
704df56d 1582 f = {
8dc9d361 1583 'format_id': '-'.join(format_id),
30d0b549 1584 'url': manifest_url,
ff99fe52 1585 'manifest_url': m3u8_url,
704df56d
PH
1586 'tbr': tbr,
1587 'ext': ext,
cb252080 1588 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
f0b5d6af
PH
1589 'protocol': entry_protocol,
1590 'preference': preference,
704df56d 1591 }
cb252080 1592 resolution = last_stream_inf.get('RESOLUTION')
704df56d 1593 if resolution:
c4c9b844
S
1594 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1595 if mobj:
1596 f['width'] = int(mobj.group('width'))
1597 f['height'] = int(mobj.group('height'))
00f4764c
RA
1598 # Unified Streaming Platform
1599 mobj = re.search(
1600 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1601 if mobj:
1602 abr, vbr = mobj.groups()
1603 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
fbb6edd2 1604 f.update({
00f4764c
RA
1605 'vbr': vbr,
1606 'abr': abr,
fbb6edd2 1607 })
cb252080
S
1608 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1609 f.update(codecs)
1610 audio_group_id = last_stream_inf.get('AUDIO')
1611 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1612 # references a rendition group MUST have a CODECS attribute.
1613 # However, this is not always respected, for example, [2]
1614 # contains EXT-X-STREAM-INF tag which references AUDIO
1615 # rendition group but does not have CODECS and despite
1616 # referencing an audio group, it represents
1617 # a complete (with audio and video) format. So, for such cases
1618 # we will ignore references to rendition groups and treat them
1619 # as complete formats.
1620 if audio_group_id and codecs and f.get('vcodec') != 'none':
1621 audio_group = groups.get(audio_group_id)
1622 if audio_group and audio_group[0].get('URI'):
1623 # TODO: update acodec for audio only formats with
1624 # the same GROUP-ID
1625 f['acodec'] = 'none'
704df56d 1626 formats.append(f)
cb252080 1627 last_stream_inf = {}
704df56d
PH
1628 return formats
1629
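The master-vs-media playlist distinction described in the comments above keys on #EXT-X-TARGETDURATION, which is required in a media playlist and forbidden in a master playlist. A minimal sketch with made-up playlists:

def is_media_playlist(m3u8_doc):
    # per the comments above: only media playlists carry #EXT-X-TARGETDURATION
    return '#EXT-X-TARGETDURATION' in m3u8_doc

master = '#EXTM3U\n#EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360\nlow/index.m3u8\n'
media = '#EXTM3U\n#EXT-X-TARGETDURATION:10\n#EXTINF:9.009,\nsegment0.ts\n'
assert not is_media_playlist(master)
assert is_media_playlist(media)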
a107193e
S
1630 @staticmethod
1631 def _xpath_ns(path, namespace=None):
1632 if not namespace:
1633 return path
1634 out = []
1635 for c in path.split('/'):
1636 if not c or c == '.':
1637 out.append(c)
1638 else:
1639 out.append('{%s}%s' % (namespace, c))
1640 return '/'.join(out)
1641
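_xpath_ns wraps every path component except '.' in the {namespace} prefix expected by ElementTree. A standalone sketch of the same expansion; the SMIL namespace URI below is only an example value.

def xpath_ns(path, namespace=None):
    if not namespace:
        return path
    return '/'.join(c if not c or c == '.' else '{%s}%s' % (namespace, c)
                    for c in path.split('/'))

assert xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL') == \
    './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'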
09f572fb 1642 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1643 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 1644
995029a1
PH
1645 if smil is False:
1646 assert not fatal
1647 return []
e89a2aab 1648
17712eeb 1649 namespace = self._parse_smil_namespace(smil)
a107193e
S
1650
1651 return self._parse_smil_formats(
1652 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1653
1654 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1655 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1656 if smil is False:
1657 return {}
1658 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1659
09f572fb 1660 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
1661 return self._download_xml(
1662 smil_url, video_id, 'Downloading SMIL file',
09f572fb 1663 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
1664
1665 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 1666 namespace = self._parse_smil_namespace(smil)
a107193e
S
1667
1668 formats = self._parse_smil_formats(
1669 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1670 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1671
1672 video_id = os.path.splitext(url_basename(smil_url))[0]
1673 title = None
1674 description = None
647eab45 1675 upload_date = None
a107193e
S
1676 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1677 name = meta.attrib.get('name')
1678 content = meta.attrib.get('content')
1679 if not name or not content:
1680 continue
1681 if not title and name == 'title':
1682 title = content
1683 elif not description and name in ('description', 'abstract'):
1684 description = content
647eab45
S
1685 elif not upload_date and name == 'date':
1686 upload_date = unified_strdate(content)
a107193e 1687
1e5bcdec
S
1688 thumbnails = [{
1689 'id': image.get('type'),
1690 'url': image.get('src'),
1691 'width': int_or_none(image.get('width')),
1692 'height': int_or_none(image.get('height')),
1693 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1694
a107193e
S
1695 return {
1696 'id': video_id,
1697 'title': title or video_id,
1698 'description': description,
647eab45 1699 'upload_date': upload_date,
1e5bcdec 1700 'thumbnails': thumbnails,
a107193e
S
1701 'formats': formats,
1702 'subtitles': subtitles,
1703 }
1704
17712eeb
S
1705 def _parse_smil_namespace(self, smil):
1706 return self._search_regex(
1707 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1708
f877c6ae 1709 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
1710 base = smil_url
1711 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1712 b = meta.get('base') or meta.get('httpBase')
1713 if b:
1714 base = b
1715 break
e89a2aab
S
1716
1717 formats = []
1718 rtmp_count = 0
a107193e 1719 http_count = 0
7f32e5dc 1720 m3u8_count = 0
a107193e 1721
81e1c4e2 1722 srcs = []
ad96b4c8
YCH
1723 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1724 for medium in media:
1725 src = medium.get('src')
81e1c4e2 1726 if not src or src in srcs:
a107193e 1727 continue
81e1c4e2 1728 srcs.append(src)
a107193e 1729
ad96b4c8
YCH
1730 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1731 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1732 width = int_or_none(medium.get('width'))
1733 height = int_or_none(medium.get('height'))
1734 proto = medium.get('proto')
1735 ext = medium.get('ext')
a107193e 1736 src_ext = determine_ext(src)
ad96b4c8 1737 streamer = medium.get('streamer') or base
a107193e
S
1738
1739 if proto == 'rtmp' or streamer.startswith('rtmp'):
1740 rtmp_count += 1
1741 formats.append({
1742 'url': streamer,
1743 'play_path': src,
1744 'ext': 'flv',
1745 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1746 'tbr': bitrate,
1747 'filesize': filesize,
1748 'width': width,
1749 'height': height,
1750 })
f877c6ae
YCH
1751 if transform_rtmp_url:
1752 streamer, src = transform_rtmp_url(streamer, src)
1753 formats[-1].update({
1754 'url': streamer,
1755 'play_path': src,
1756 })
a107193e
S
1757 continue
1758
1759 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 1760 src_url = src_url.strip()
a107193e
S
1761
1762 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 1763 m3u8_formats = self._extract_m3u8_formats(
1764 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1765 if len(m3u8_formats) == 1:
1766 m3u8_count += 1
1767 m3u8_formats[0].update({
1768 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1769 'tbr': bitrate,
1770 'width': width,
1771 'height': height,
1772 })
1773 formats.extend(m3u8_formats)
a107193e
S
1774 continue
1775
1776 if src_ext == 'f4m':
1777 f4m_url = src_url
1778 if not f4m_params:
1779 f4m_params = {
1780 'hdcore': '3.2.0',
1781 'plugin': 'flowplayer-3.2.0.1',
1782 }
1783 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 1784 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 1785 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
a107193e
S
1786 continue
1787
c78e4817 1788 if src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
1789 http_count += 1
1790 formats.append({
1791 'url': src_url,
1792 'ext': ext or src_ext or 'flv',
1793 'format_id': 'http-%d' % (bitrate or http_count),
1794 'tbr': bitrate,
1795 'filesize': filesize,
1796 'width': width,
1797 'height': height,
1798 })
1799 continue
63757032 1800
e89a2aab
S
1801 return formats
1802
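Relative SMIL src values above are resolved against the base taken from <meta base=...> (or httpBase), falling back to the SMIL URL itself. A hedged sketch with illustrative URLs; urljoin stands in for compat_urlparse.urljoin used in the code.

from urllib.parse import urljoin

smil_url = 'https://example.com/player/clip.smil'
base = 'https://cdn.example.com/media/'  # hypothetical <meta base="..."> value
src = 'videos/clip_720.mp4'
src_url = src if src.startswith('http') else urljoin(base or smil_url, src)
assert src_url == 'https://cdn.example.com/media/videos/clip_720.mp4'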
ce00af87 1803 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 1804 urls = []
a107193e
S
1805 subtitles = {}
1806 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1807 src = textstream.get('src')
d413095f 1808 if not src or src in urls:
a107193e 1809 continue
d413095f 1810 urls.append(src)
df634be2 1811 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 1812 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
1813 subtitles.setdefault(lang, []).append({
1814 'url': src,
1815 'ext': ext,
1816 })
1817 return subtitles
63757032 1818
47a5cb77 1819 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 1820 xspf = self._download_xml(
47a5cb77 1821 xspf_url, playlist_id, 'Downloading xspf playlist',
942acef5
S
1822 'Unable to download xspf manifest', fatal=fatal)
1823 if xspf is False:
1824 return []
47a5cb77
S
1825 return self._parse_xspf(
1826 xspf, playlist_id, xspf_url=xspf_url,
1827 xspf_base_url=base_url(xspf_url))
8d6765cf 1828
47a5cb77 1829 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
1830 NS_MAP = {
1831 'xspf': 'http://xspf.org/ns/0/',
1832 's1': 'http://static.streamone.nl/player/ns/0',
1833 }
1834
1835 entries = []
47a5cb77 1836 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 1837 title = xpath_text(
98044462 1838 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
1839 description = xpath_text(
1840 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1841 thumbnail = xpath_text(
1842 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1843 duration = float_or_none(
1844 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1845
47a5cb77
S
1846 formats = []
1847 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1848 format_url = urljoin(xspf_base_url, location.text)
1849 if not format_url:
1850 continue
1851 formats.append({
1852 'url': format_url,
1853 'manifest_url': xspf_url,
1854 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1855 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1856 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1857 })
8d6765cf
S
1858 self._sort_formats(formats)
1859
1860 entries.append({
1861 'id': playlist_id,
1862 'title': title,
1863 'description': description,
1864 'thumbnail': thumbnail,
1865 'duration': duration,
1866 'formats': formats,
1867 })
1868 return entries
1869
1bac3455 1870 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
47a5cb77 1871 res = self._download_xml_handle(
1bac3455 1872 mpd_url, video_id,
1873 note=note or 'Downloading MPD manifest',
1874 errnote=errnote or 'Failed to download MPD manifest',
2d2fa82d 1875 fatal=fatal)
1bac3455 1876 if res is False:
2d2fa82d 1877 return []
47a5cb77 1878 mpd_doc, urlh = res
02dc0a36 1879 mpd_base_url = base_url(urlh.geturl())
1bac3455 1880
91cb6b50 1881 return self._parse_mpd_formats(
47a5cb77 1882 mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
86f4d14f 1883 formats_dict=formats_dict, mpd_url=mpd_url)
2d2fa82d 1884
86f4d14f 1885 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
f0948348
S
1886 """
1887 Parse formats from MPD manifest.
1888 References:
1889 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1890 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1891 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1892 """
1bac3455 1893 if mpd_doc.get('type') == 'dynamic':
1894 return []
2d2fa82d 1895
91cb6b50 1896 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 1897
1898 def _add_ns(path):
1899 return self._xpath_ns(path, namespace)
1900
675d0016 1901 def is_drm_protected(element):
1902 return element.find(_add_ns('ContentProtection')) is not None
1903
1bac3455 1904 def extract_multisegment_info(element, ms_parent_info):
1905 ms_info = ms_parent_info.copy()
b4c1d6e8
S
1906
1907 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1908 # common attributes and elements. We will only extract what is relevant
1909 # for us.
1910 def extract_common(source):
1911 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1912 if segment_timeline is not None:
1913 s_e = segment_timeline.findall(_add_ns('S'))
1914 if s_e:
1915 ms_info['total_number'] = 0
1916 ms_info['s'] = []
1917 for s in s_e:
1918 r = int(s.get('r', 0))
1919 ms_info['total_number'] += 1 + r
1920 ms_info['s'].append({
1921 't': int(s.get('t', 0)),
1922 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1923 'd': int(s.attrib['d']),
1924 'r': r,
1925 })
1926 start_number = source.get('startNumber')
1927 if start_number:
1928 ms_info['start_number'] = int(start_number)
1929 timescale = source.get('timescale')
1930 if timescale:
1931 ms_info['timescale'] = int(timescale)
1932 segment_duration = source.get('duration')
1933 if segment_duration:
48504785 1934 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
1935
1936 def extract_Initialization(source):
1937 initialization = source.find(_add_ns('Initialization'))
1938 if initialization is not None:
1939 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1940
f14be228 1941 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 1942 if segment_list is not None:
b4c1d6e8
S
1943 extract_common(segment_list)
1944 extract_Initialization(segment_list)
f14be228 1945 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 1946 if segment_urls_e:
1947 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 1948 else:
f14be228 1949 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 1950 if segment_template is not None:
b4c1d6e8 1951 extract_common(segment_template)
e228616c
S
1952 media = segment_template.get('media')
1953 if media:
1954 ms_info['media'] = media
1bac3455 1955 initialization = segment_template.get('initialization')
1956 if initialization:
e228616c 1957 ms_info['initialization'] = initialization
1bac3455 1958 else:
b4c1d6e8 1959 extract_Initialization(segment_template)
1bac3455 1960 return ms_info
b323e170 1961
1bac3455 1962 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 1963 formats = []
f14be228 1964 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 1965 period_duration = parse_duration(period.get('duration')) or mpd_duration
1966 period_ms_info = extract_multisegment_info(period, {
1967 'start_number': 1,
1968 'timescale': 1,
1969 })
f14be228 1970 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
675d0016 1971 if is_drm_protected(adaptation_set):
1972 continue
1bac3455 1973 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 1974 for representation in adaptation_set.findall(_add_ns('Representation')):
675d0016 1975 if is_drm_protected(representation):
1976 continue
1bac3455 1977 representation_attrib = adaptation_set.attrib.copy()
1978 representation_attrib.update(representation.attrib)
f0948348 1979 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759
YCH
1980 mime_type = representation_attrib['mimeType']
1981 content_type = mime_type.split('/')[0]
1bac3455 1982 if content_type == 'text':
1983 # TODO implement WebVTT downloading
1984 pass
40fcba5e 1985 elif content_type in ('video', 'audio'):
1bac3455 1986 base_url = ''
1987 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 1988 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 1989 if base_url_e is not None:
1990 base_url = base_url_e.text + base_url
1991 if re.match(r'^https?://', base_url):
1992 break
bb20526b
S
1993 if mpd_base_url and not re.match(r'^https?://', base_url):
1994 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1995 mpd_base_url += '/'
1bac3455 1996 base_url = mpd_base_url + base_url
1997 representation_id = representation_attrib.get('id')
d577c796 1998 lang = representation_attrib.get('lang')
51e9094f 1999 url_el = representation.find(_add_ns('BaseURL'))
2000 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2001 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1bac3455 2002 f = {
154c209e 2003 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1bac3455 2004 'url': base_url,
86f4d14f 2005 'manifest_url': mpd_url,
a6c8b759 2006 'ext': mimetype2ext(mime_type),
1bac3455 2007 'width': int_or_none(representation_attrib.get('width')),
2008 'height': int_or_none(representation_attrib.get('height')),
9c99bef7 2009 'tbr': float_or_none(bandwidth, 1000),
1bac3455 2010 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2011 'fps': int_or_none(representation_attrib.get('frameRate')),
d577c796 2012 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 2013 'format_note': 'DASH %s' % content_type,
51e9094f 2014 'filesize': filesize,
126f225b 2015 'container': mimetype2ext(mime_type) + '_dash',
1bac3455 2016 }
7fe15920 2017 f.update(parse_codecs(representation_attrib.get('codecs')))
1bac3455 2018 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2019
e228616c
S
2020 def prepare_template(template_name, identifiers):
2021 t = representation_ms_info[template_name]
2022 t = t.replace('$RepresentationID$', representation_id)
2023 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2024 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2025 t = t.replace('$$', '$')
2026 return t
2027
2028 # @initialization is a regular template like @media one
2029 # so it should be handled just the same way (see
2030 # https://github.com/rg3/youtube-dl/issues/11605)
2031 if 'initialization' in representation_ms_info:
2032 initialization_template = prepare_template(
2033 'initialization',
2034 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2035 # $Time$ shall not be included for @initialization thus
2036 # only $Bandwidth$ remains
2037 ('Bandwidth', ))
2038 representation_ms_info['initialization_url'] = initialization_template % {
2039 'Bandwidth': bandwidth,
2040 }
2041
1141e910
S
2042 def location_key(location):
2043 return 'url' if re.match(r'^https?://', location) else 'path'
2044
e228616c
S
2045 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2046
2047 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2048 media_location_key = location_key(media_template)
f0948348
S
2049
2050 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2051 # can't be used at the same time
b4c1d6e8
S
2052 if '%(Number' in media_template and 's' not in representation_ms_info:
2053 segment_duration = None
c110944f 2054 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2055 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2056 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2057 representation_ms_info['fragments'] = [{
1141e910 2058 media_location_key: media_template % {
b4c1d6e8 2059 'Number': segment_number,
e228616c 2060 'Bandwidth': bandwidth,
b4c1d6e8
S
2061 },
2062 'duration': segment_duration,
2063 } for segment_number in range(
2064 representation_ms_info['start_number'],
2065 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2066 else:
b4c1d6e8
S
2067 # $Number*$ or $Time$ in media template with S list available
2068 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2069 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2070 representation_ms_info['fragments'] = []
f0948348 2071 segment_time = 0
b4c1d6e8
S
2072 segment_d = None
2073 segment_number = representation_ms_info['start_number']
f0948348
S
2074
2075 def add_segment_url():
b4c1d6e8
S
2076 segment_url = media_template % {
2077 'Time': segment_time,
e228616c 2078 'Bandwidth': bandwidth,
b4c1d6e8
S
2079 'Number': segment_number,
2080 }
b4c1d6e8 2081 representation_ms_info['fragments'].append({
1141e910 2082 media_location_key: segment_url,
b4c1d6e8
S
2083 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2084 })
f0948348
S
2085
2086 for num, s in enumerate(representation_ms_info['s']):
2087 segment_time = s.get('t') or segment_time
b4c1d6e8 2088 segment_d = s['d']
f0948348 2089 add_segment_url()
b4c1d6e8 2090 segment_number += 1
f0948348 2091 for r in range(s.get('r', 0)):
b4c1d6e8 2092 segment_time += segment_d
f0948348 2093 add_segment_url()
b4c1d6e8
S
2094 segment_number += 1
2095 segment_time += segment_d
2096 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2097 # No media template
2098 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2099 # or any YouTube dashsegments video
2100 fragments = []
d04621da
S
2101 segment_index = 0
2102 timescale = representation_ms_info['timescale']
2103 for s in representation_ms_info['s']:
2104 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2105 for r in range(s.get('r', 0) + 1):
1141e910 2106 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2107 fragments.append({
1141e910 2108 location_key(segment_uri): segment_uri,
d04621da 2109 'duration': duration,
b4c1d6e8 2110 })
d04621da 2111 segment_index += 1
b4c1d6e8 2112 representation_ms_info['fragments'] = fragments
41bf647e
PN
2113 elif 'segment_urls' in representation_ms_info:
2114 # Segment URLs with no SegmentTimeline
2115 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
78593e29 2116 # https://github.com/rg3/youtube-dl/pull/14844
41bf647e 2117 fragments = []
603fc4e0
S
2118 segment_duration = float_or_none(
2119 representation_ms_info['segment_duration'],
2120 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2121 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2122 fragment = {
41bf647e 2123 location_key(segment_url): segment_url,
603fc4e0
S
2124 }
2125 if segment_duration:
2126 fragment['duration'] = segment_duration
2127 fragments.append(fragment)
41bf647e 2128 representation_ms_info['fragments'] = fragments
86f4d14f
S
2129 # NB: MPD manifest may contain direct URLs to unfragmented media.
2130 # No fragments key is present in this case.
2131 if 'fragments' in representation_ms_info:
1bac3455 2132 f.update({
1141e910 2133 'fragment_base_url': base_url,
b4c1d6e8 2134 'fragments': [],
1bac3455 2135 'protocol': 'http_dash_segments',
df374b52 2136 })
1bac3455 2137 if 'initialization_url' in representation_ms_info:
e228616c 2138 initialization_url = representation_ms_info['initialization_url']
1bac3455 2139 if not f.get('url'):
2140 f['url'] = initialization_url
1141e910 2141 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2142 f['fragments'].extend(representation_ms_info['fragments'])
9d6ac71c
S
2143 # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2144 # is not necessarily unique within a Period thus formats with
2145 # the same `format_id` are quite possible. There are numerous examples
2146 # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2147 # https://github.com/rg3/youtube-dl/issues/13919)
2148 full_info = formats_dict.get(representation_id, {}).copy()
2149 full_info.update(f)
2150 formats.append(full_info)
17b598d3 2151 else:
1bac3455 2152 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
2153 return formats
2154
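prepare_template above rewrites DASH placeholders ($RepresentationID$, $Number$, $Number%05d$, $$) into Python %-format specifiers that are later filled per segment. A small standalone sketch with an invented template and representation id:

import re

def prepare_template(t, representation_id, identifiers=('Number', 'Bandwidth', 'Time')):
    t = t.replace('$RepresentationID$', representation_id)
    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
    return t.replace('$$', '$')

template = prepare_template('chunk_$RepresentationID$_$Number%05d$.m4s', 'video-1')
assert template == 'chunk_video-1_%(Number)05d.m4s'
assert template % {'Number': 7} == 'chunk_video-1_00007.m4s'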
b2758123 2155 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
47a5cb77 2156 res = self._download_xml_handle(
b2758123
RA
2157 ism_url, video_id,
2158 note=note or 'Downloading ISM manifest',
2159 errnote=errnote or 'Failed to download ISM manifest',
2160 fatal=fatal)
2161 if res is False:
2162 return []
47a5cb77 2163 ism_doc, urlh = res
b2758123 2164
47a5cb77 2165 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
b2758123
RA
2166
2167 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2168 """
2169 Parse formats from ISM manifest.
2170 References:
2171 1. [MS-SSTR]: Smooth Streaming Protocol,
2172 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2173 """
b2758123
RA
2174 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2175 return []
2176
b2758123
RA
2177 duration = int(ism_doc.attrib['Duration'])
2178 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2179
2180 formats = []
2181 for stream in ism_doc.findall('StreamIndex'):
2182 stream_type = stream.get('Type')
2183 if stream_type not in ('video', 'audio'):
2184 continue
2185 url_pattern = stream.attrib['Url']
2186 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2187 stream_name = stream.get('Name')
2188 for track in stream.findall('QualityLevel'):
2501d41e 2189 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
b2758123
RA
2190 # TODO: add support for WVC1 and WMAP
2191 if fourcc not in ('H264', 'AVC1', 'AACL'):
2192 self.report_warning('%s is not a supported codec' % fourcc)
2193 continue
2194 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2195 # [1] does not mention Width and Height attributes. However,
2196 # they're often present while MaxWidth and MaxHeight are
2197 # missing, so they should be used as fallbacks
2198 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2199 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2200 sampling_rate = int_or_none(track.get('SamplingRate'))
2201
2202 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2203 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2204
2205 fragments = []
2206 fragment_ctx = {
2207 'time': 0,
2208 }
2209 stream_fragments = stream.findall('c')
2210 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2211 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2212 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2213 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2214 if not fragment_ctx['duration']:
2215 try:
2216 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2217 except IndexError:
2218 next_fragment_time = duration
1616f9b4 2219 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2220 for _ in range(fragment_repeat):
2221 fragments.append({
1616f9b4 2222 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2223 'duration': fragment_ctx['duration'] / stream_timescale,
2224 })
2225 fragment_ctx['time'] += fragment_ctx['duration']
2226
2227 format_id = []
2228 if ism_id:
2229 format_id.append(ism_id)
2230 if stream_name:
2231 format_id.append(stream_name)
2232 format_id.append(compat_str(tbr))
2233
2234 formats.append({
2235 'format_id': '-'.join(format_id),
2236 'url': ism_url,
2237 'manifest_url': ism_url,
2238 'ext': 'ismv' if stream_type == 'video' else 'isma',
2239 'width': width,
2240 'height': height,
2241 'tbr': tbr,
2242 'asr': sampling_rate,
2243 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2244 'acodec': 'none' if stream_type == 'video' else fourcc,
2245 'protocol': 'ism',
2246 'fragments': fragments,
2247 '_download_params': {
2248 'duration': duration,
2249 'timescale': stream_timescale,
2250 'width': width or 0,
2251 'height': height or 0,
2252 'fourcc': fourcc,
2253 'codec_private_data': track.get('CodecPrivateData'),
2254 'sampling_rate': sampling_rate,
2255 'channels': int_or_none(track.get('Channels', 2)),
2256 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2257 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2258 },
2259 })
2260 return formats
2261
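Each <c> element of an ISM StreamIndex carries a start time t, a duration d and an optional repeat count r, all expressed in the stream timescale. A simplified sketch of the fragment expansion above, with made-up values and without the next-fragment duration fallback:

def expand_fragments(stream_fragments, timescale):
    fragments, time = [], 0
    for frag in stream_fragments:
        time = frag.get('t', time)
        for _ in range(frag.get('r', 1)):
            fragments.append({'start': time / timescale, 'duration': frag['d'] / timescale})
            time += frag['d']
    return fragments

frags = expand_fragments([{'t': 0, 'd': 20000000, 'r': 2}, {'d': 10000000}], 10000000)
assert [f['duration'] for f in frags] == [2.0, 2.0, 1.0]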
eeb0a956 2262 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
6780154e
S
2263 def absolute_url(item_url):
2264 return urljoin(base_url, item_url)
59bbe491 2265
2266 def parse_content_type(content_type):
2267 if not content_type:
2268 return {}
2269 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2270 if ctr:
2271 mimetype, codecs = ctr.groups()
2272 f = parse_codecs(codecs)
2273 f['ext'] = mimetype2ext(mimetype)
2274 return f
2275 return {}
2276
868f79db 2277 def _media_formats(src, cur_media_type, type_info={}):
520251c0 2278 full_url = absolute_url(src)
82889d4a 2279 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 2280 if ext == 'm3u8':
520251c0
YCH
2281 is_plain_url = False
2282 formats = self._extract_m3u8_formats(
ad120ae1 2283 full_url, video_id, ext='mp4',
eeb0a956 2284 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
b359e977 2285 preference=preference, fatal=False)
87a449c1
S
2286 elif ext == 'mpd':
2287 is_plain_url = False
2288 formats = self._extract_mpd_formats(
b359e977 2289 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
2290 else:
2291 is_plain_url = True
2292 formats = [{
2293 'url': full_url,
2294 'vcodec': 'none' if cur_media_type == 'audio' else None,
2295 }]
2296 return is_plain_url, formats
2297
59bbe491 2298 entries = []
4328ddf8
S
2299 # amp-video and amp-audio are very similar to their HTML5 counterparts
2300 # so we wll include them right here (see
2301 # https://www.ampproject.org/docs/reference/components/amp-video)
cea364f7
YCH
2302 media_tags = [(media_tag, media_type, '')
2303 for media_tag, media_type
4328ddf8 2304 in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2aec7256
S
2305 media_tags.extend(re.findall(
2306 # We only allow video|audio followed by a whitespace or '>'.
2307 # Allowing more characters may result in a significant slowdown (see
2308 # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2309 # http://www.porntrex.com/maps/videositemap.xml).
4328ddf8 2310 r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
cea364f7 2311 for media_tag, media_type, media_content in media_tags:
59bbe491 2312 media_info = {
2313 'formats': [],
2314 'subtitles': {},
2315 }
2316 media_attributes = extract_attributes(media_tag)
2317 src = media_attributes.get('src')
2318 if src:
dedb1770 2319 _, formats = _media_formats(src, media_type)
520251c0 2320 media_info['formats'].extend(formats)
6780154e 2321 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 2322 if media_content:
2323 for source_tag in re.findall(r'<source[^>]+>', media_content):
2324 source_attributes = extract_attributes(source_tag)
2325 src = source_attributes.get('src')
2326 if not src:
2327 continue
82889d4a 2328 f = parse_content_type(source_attributes.get('type'))
868f79db 2329 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 2330 if is_plain_url:
dd121cc1
S
2331 # The res attribute is not standard but is seen several times
2332 # in the wild
1ed45499
S
2333 f.update({
2334 'height': int_or_none(source_attributes.get('res')),
2335 'format_id': source_attributes.get('label'),
2336 })
520251c0
YCH
2337 f.update(formats[0])
2338 media_info['formats'].append(f)
2339 else:
2340 media_info['formats'].extend(formats)
59bbe491 2341 for track_tag in re.findall(r'<track[^>]+>', media_content):
2342 track_attributes = extract_attributes(track_tag)
2343 kind = track_attributes.get('kind')
5968d7d2 2344 if not kind or kind in ('subtitles', 'captions'):
59bbe491 2345 src = track_attributes.get('src')
2346 if not src:
2347 continue
2348 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2349 media_info['subtitles'].setdefault(lang, []).append({
2350 'url': absolute_url(src),
2351 })
5968d7d2 2352 if media_info['formats'] or media_info['subtitles']:
59bbe491 2353 entries.append(media_info)
2354 return entries
2355
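parse_content_type above splits a MIME type with an optional codecs parameter into the container and codec strings. A minimal sketch using the same regular expression, with example type strings:

import re

def split_content_type(content_type):
    m = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
    return (m.group('mimetype'), m.group('codecs')) if m else (None, None)

assert split_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"') == \
    ('video/mp4', 'avc1.42E01E, mp4a.40.2')
assert split_content_type('audio/mpeg') == ('audio/mpeg', None)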
c4251b9a 2356 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
c7c43a93 2357 formats = []
e71a4509 2358 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 2359 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
2360 hds_host = hosts.get('hds')
2361 if hds_host:
2362 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
2363 if 'hdcore=' not in f4m_url:
2364 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2365 f4m_formats = self._extract_f4m_formats(
2366 f4m_url, video_id, f4m_id='hds', fatal=False)
2367 for entry in f4m_formats:
2368 entry.update({'extra_param_to_segment_url': hdcore_sign})
2369 formats.extend(f4m_formats)
c4251b9a
RA
2370 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2371 hls_host = hosts.get('hls')
2372 if hls_host:
2373 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
c7c43a93
RA
2374 formats.extend(self._extract_m3u8_formats(
2375 m3u8_url, video_id, 'mp4', 'm3u8_native',
2376 m3u8_id='hls', fatal=False))
2377 return formats
2378
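The Akamai helper above derives the HDS manifest URL from the HLS one (and vice versa) by swapping the /i/ and /z/ path markers and the manifest filenames. A small sketch; the host and path are example values only.

import re

hls_url = 'https://example-a.akamaihd.net/i/video/clip_,400,800,k.mp4.csmil/master.m3u8'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', hls_url).replace('/master.m3u8', '/manifest.f4m')
assert f4m_url == 'https://example-a.akamaihd.net/z/video/clip_,400,800,k.mp4.csmil/manifest.f4m'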
6ad02195 2379 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 2380 query = compat_urlparse.urlparse(url).query
6ad02195 2381 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
2382 mobj = re.search(
2383 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2384 url_base = mobj.group('url')
2385 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 2386 formats = []
044eeb14
S
2387
2388 def manifest_url(manifest):
2389 m_url = '%s/%s' % (http_base_url, manifest)
2390 if query:
2391 m_url += '?%s' % query
2392 return m_url
2393
6ad02195
RA
2394 if 'm3u8' not in skip_protocols:
2395 formats.extend(self._extract_m3u8_formats(
044eeb14 2396 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
2397 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2398 if 'f4m' not in skip_protocols:
2399 formats.extend(self._extract_f4m_formats(
044eeb14 2400 manifest_url('manifest.f4m'),
6ad02195 2401 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2402 if 'dash' not in skip_protocols:
2403 formats.extend(self._extract_mpd_formats(
044eeb14 2404 manifest_url('manifest.mpd'),
0384932e 2405 video_id, mpd_id='dash', fatal=False))
6ad02195 2406 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2407 if 'smil' not in skip_protocols:
2408 rtmp_formats = self._extract_smil_formats(
044eeb14 2409 manifest_url('jwplayer.smil'),
6ad02195
RA
2410 video_id, fatal=False)
2411 for rtmp_format in rtmp_formats:
2412 rtsp_format = rtmp_format.copy()
2413 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2414 del rtsp_format['play_path']
2415 del rtsp_format['ext']
2416 rtsp_format.update({
2417 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2418 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2419 'protocol': 'rtsp',
2420 })
2421 formats.extend([rtmp_format, rtsp_format])
2422 else:
2423 for protocol in ('rtmp', 'rtsp'):
2424 if protocol not in skip_protocols:
2425 formats.append({
f2e2f0c7 2426 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
2427 'format_id': protocol,
2428 'protocol': protocol,
2429 })
2430 return formats
2431
c73e330e 2432 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 2433 mobj = re.search(
ac9c69ac 2434 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
2435 webpage)
2436 if mobj:
c73e330e
RU
2437 try:
2438 jwplayer_data = self._parse_json(mobj.group('options'),
2439 video_id=video_id,
2440 transform_source=transform_source)
2441 except ExtractorError:
2442 pass
2443 else:
2444 if isinstance(jwplayer_data, dict):
2445 return jwplayer_data
a4a554a7
YCH
2446
2447 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
2448 jwplayer_data = self._find_jwplayer_data(
2449 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
2450 return self._parse_jwplayer_data(
2451 jwplayer_data, video_id, *args, **kwargs)
2452
2453 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2454 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2455 # JWPlayer backward compatibility: flattened playlists
2456 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2457 if 'playlist' not in jwplayer_data:
2458 jwplayer_data = {'playlist': [jwplayer_data]}
2459
2460 entries = []
2461
2462 # JWPlayer backward compatibility: single playlist item
2463 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2464 if not isinstance(jwplayer_data['playlist'], list):
2465 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2466
2467 for video_data in jwplayer_data['playlist']:
2468 # JWPlayer backward compatibility: flattened sources
2469 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2470 if 'sources' not in video_data:
2471 video_data['sources'] = [video_data]
2472
2473 this_video_id = video_id or video_data['mediaid']
2474
1a2192cb
S
2475 formats = self._parse_jwplayer_formats(
2476 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2477 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
2478
2479 subtitles = {}
2480 tracks = video_data.get('tracks')
2481 if tracks and isinstance(tracks, list):
2482 for track in tracks:
96a2daa1
S
2483 if not isinstance(track, dict):
2484 continue
f4b74272
S
2485 track_kind = track.get('kind')
2486 if not track_kind or not isinstance(track_kind, compat_str):
2487 continue
2488 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
2489 continue
2490 track_url = urljoin(base_url, track.get('file'))
2491 if not track_url:
2492 continue
2493 subtitles.setdefault(track.get('label') or 'en', []).append({
2494 'url': self._proto_relative_url(track_url)
2495 })
2496
50d808f5 2497 entry = {
a4a554a7 2498 'id': this_video_id,
50d808f5 2499 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
a4a554a7
YCH
2500 'description': video_data.get('description'),
2501 'thumbnail': self._proto_relative_url(video_data.get('image')),
2502 'timestamp': int_or_none(video_data.get('pubdate')),
2503 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2504 'subtitles': subtitles,
50d808f5
RA
2505 }
2506 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2507 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2508 entry.update({
2509 '_type': 'url_transparent',
2510 'url': formats[0]['url'],
2511 })
2512 else:
2513 self._sort_formats(formats)
2514 entry['formats'] = formats
2515 entries.append(entry)
a4a554a7
YCH
2516 if len(entries) == 1:
2517 return entries[0]
2518 else:
2519 return self.playlist_result(entries)
2520
ed0cf9b3
S
2521 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2522 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 2523 urls = []
ed0cf9b3 2524 formats = []
1a2192cb 2525 for source in jwplayer_sources_data:
0a268c6e
S
2526 if not isinstance(source, dict):
2527 continue
bf1b87cd
RA
2528 source_url = self._proto_relative_url(source.get('file'))
2529 if not source_url:
2530 continue
ed0cf9b3
S
2531 if base_url:
2532 source_url = compat_urlparse.urljoin(base_url, source_url)
bf1b87cd
RA
2533 if source_url in urls:
2534 continue
2535 urls.append(source_url)
ed0cf9b3
S
2536 source_type = source.get('type') or ''
2537 ext = mimetype2ext(source_type) or determine_ext(source_url)
2538 if source_type == 'hls' or ext == 'm3u8':
2539 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
2540 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2541 m3u8_id=m3u8_id, fatal=False))
0d9c48de 2542 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
2543 formats.extend(self._extract_mpd_formats(
2544 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
2545 elif ext == 'smil':
2546 formats.extend(self._extract_smil_formats(
2547 source_url, video_id, fatal=False))
ed0cf9b3 2548 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
2549 elif source_type.startswith('audio') or ext in (
2550 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
2551 formats.append({
2552 'url': source_url,
2553 'vcodec': 'none',
2554 'ext': ext,
2555 })
2556 else:
2557 height = int_or_none(source.get('height'))
2558 if height is None:
2559 # Often no height is provided but there is a label in
0236cd0d 2560 # a format like "1080p", "720p SD", or 1080.
ed0cf9b3 2561 height = int_or_none(self._search_regex(
0236cd0d 2562 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
ed0cf9b3
S
2563 'height', default=None))
2564 a_format = {
2565 'url': source_url,
2566 'width': int_or_none(source.get('width')),
2567 'height': height,
0236cd0d 2568 'tbr': int_or_none(source.get('bitrate')),
ed0cf9b3
S
2569 'ext': ext,
2570 }
2571 if source_url.startswith('rtmp'):
2572 a_format['ext'] = 'flv'
ed0cf9b3
S
2573 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2574 # of jwplayer.flash.swf
2575 rtmp_url_parts = re.split(
2576 r'((?:mp4|mp3|flv):)', source_url, 1)
2577 if len(rtmp_url_parts) == 3:
2578 rtmp_url, prefix, play_path = rtmp_url_parts
2579 a_format.update({
2580 'url': rtmp_url,
2581 'play_path': prefix + play_path,
2582 })
2583 if rtmp_params:
2584 a_format.update(rtmp_params)
2585 formats.append(a_format)
2586 return formats
2587
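When a JWPlayer source carries no numeric height, the label (e.g. "1080p", "720p SD") is parsed instead, as done above. A standalone sketch of that fallback with illustrative labels:

import re

def height_from_label(label):
    m = re.search(r'^(\d{3,4})[pP]?(?:\b|$)', label or '')
    return int(m.group(1)) if m else None

assert height_from_label('1080p') == 1080
assert height_from_label('720p SD') == 720
assert height_from_label('Auto') is None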
f4b1c7ad
PH
2588 def _live_title(self, name):
2589 """ Generate the title for a live video """
2590 now = datetime.datetime.now()
611c1dd9 2591 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
2592 return name + ' ' + now_str
2593
b14f3a4c
PH
2594 def _int(self, v, name, fatal=False, **kwargs):
2595 res = int_or_none(v, **kwargs)
2596 if 'get_attr' in kwargs:
2597 print(getattr(v, kwargs['get_attr']))
2598 if res is None:
2599 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2600 if fatal:
2601 raise ExtractorError(msg)
2602 else:
2603 self._downloader.report_warning(msg)
2604 return res
2605
2606 def _float(self, v, name, fatal=False, **kwargs):
2607 res = float_or_none(v, **kwargs)
2608 if res is None:
2609 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2610 if fatal:
2611 raise ExtractorError(msg)
2612 else:
2613 self._downloader.report_warning(msg)
2614 return res
2615
40e41780
TF
2616 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2617 path='/', secure=False, discard=False, rest={}, **kwargs):
810fb84d 2618 cookie = compat_cookiejar.Cookie(
4ed2d7b7 2619 0, name, value, port, port is not None, domain, True,
40e41780
TF
2620 domain.startswith('.'), path, True, secure, expire_time,
2621 discard, None, None, rest)
42939b61
JMF
2622 self._downloader.cookiejar.set_cookie(cookie)
2623
799207e8 2624 def _get_cookies(self, url):
2625 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 2626 req = sanitized_Request(url)
799207e8 2627 self._downloader.cookiejar.add_cookie_header(req)
2628 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2629
05900629
PH
2630 def get_testcases(self, include_onlymatching=False):
2631 t = getattr(self, '_TEST', None)
2632 if t:
2633 assert not hasattr(self, '_TESTS'), \
2634 '%s has _TEST and _TESTS' % type(self).__name__
2635 tests = [t]
2636 else:
2637 tests = getattr(self, '_TESTS', [])
2638 for t in tests:
2639 if not include_onlymatching and t.get('only_matching', False):
2640 continue
2641 t['name'] = type(self).__name__[:-len('IE')]
2642 yield t
2643
2644 def is_suitable(self, age_limit):
2645 """ Test whether the extractor is generally suitable for the given
2646 age limit (i.e. pornographic sites are not, all others usually are) """
2647
2648 any_restricted = False
2649 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 2650 if tc.get('playlist', []):
05900629
PH
2651 tc = tc['playlist'][0]
2652 is_restricted = age_restricted(
2653 tc.get('info_dict', {}).get('age_limit'), age_limit)
2654 if not is_restricted:
2655 return True
2656 any_restricted = any_restricted or is_restricted
2657 return not any_restricted
2658
a504ced0 2659 def extract_subtitles(self, *args, **kwargs):
9868ea49
JMF
2660 if (self._downloader.params.get('writesubtitles', False) or
2661 self._downloader.params.get('listsubtitles')):
2662 return self._get_subtitles(*args, **kwargs)
2663 return {}
a504ced0
JMF
2664
2665 def _get_subtitles(self, *args, **kwargs):
611c1dd9 2666 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 2667
912e0b7e
YCH
2668 @staticmethod
2669 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2670 """ Merge subtitle items for one language. Items with duplicated URLs
2671 will be dropped. """
2672 list1_urls = set([item['url'] for item in subtitle_list1])
2673 ret = list(subtitle_list1)
2674 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2675 return ret
2676
2677 @classmethod
8c97f819 2678 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 2679 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
2680 ret = dict(subtitle_dict1)
2681 for lang in subtitle_dict2:
8c97f819 2682 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
2683 return ret
2684
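_merge_subtitle_items combines two per-language lists while dropping entries whose URL already occurs in the first list. A short sketch with made-up URLs:

def merge_subtitle_items(list1, list2):
    seen = set(item['url'] for item in list1)
    return list(list1) + [item for item in list2 if item['url'] not in seen]

a = [{'url': 'https://example.com/en.vtt'}]
b = [{'url': 'https://example.com/en.vtt'}, {'url': 'https://example.com/en.srt'}]
assert merge_subtitle_items(a, b) == [
    {'url': 'https://example.com/en.vtt'}, {'url': 'https://example.com/en.srt'}]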
360e1ca5 2685 def extract_automatic_captions(self, *args, **kwargs):
9868ea49
JMF
2686 if (self._downloader.params.get('writeautomaticsub', False) or
2687 self._downloader.params.get('listsubtitles')):
2688 return self._get_automatic_captions(*args, **kwargs)
2689 return {}
360e1ca5
JMF
2690
2691 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 2692 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 2693
d77ab8e2
S
2694 def mark_watched(self, *args, **kwargs):
2695 if (self._downloader.params.get('mark_watched', False) and
2696 (self._get_login_info()[0] is not None or
2697 self._downloader.params.get('cookiefile') is not None)):
2698 self._mark_watched(*args, **kwargs)
2699
2700 def _mark_watched(self, *args, **kwargs):
2701 raise NotImplementedError('This method must be implemented by subclasses')
2702
38cce791
YCH
2703 def geo_verification_headers(self):
2704 headers = {}
2705 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2706 if geo_verification_proxy:
2707 headers['Ytdl-request-proxy'] = geo_verification_proxy
2708 return headers
2709
98763ee3
YCH
2710 def _generic_id(self, url):
2711 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2712
2713 def _generic_title(self, url):
2714 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2715
8dbe9899 2716
d6983cb4
PH
2717class SearchInfoExtractor(InfoExtractor):
2718 """
2719 Base class for paged search query extractors.
10952eb2 2720 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
d6983cb4
PH
2721 Instances should define _SEARCH_KEY and _MAX_RESULTS.
2722 """
2723
2724 @classmethod
2725 def _make_valid_url(cls):
2726 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2727
2728 @classmethod
2729 def suitable(cls, url):
2730 return re.match(cls._make_valid_url(), url) is not None
2731
2732 def _real_extract(self, query):
2733 mobj = re.match(self._make_valid_url(), query)
2734 if mobj is None:
f1a9d64e 2735 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
2736
2737 prefix = mobj.group('prefix')
2738 query = mobj.group('query')
2739 if prefix == '':
2740 return self._get_n_results(query, 1)
2741 elif prefix == 'all':
2742 return self._get_n_results(query, self._MAX_RESULTS)
2743 else:
2744 n = int(prefix)
2745 if n <= 0:
f1a9d64e 2746 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 2747 elif n > self._MAX_RESULTS:
f1a9d64e 2748 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
2749 n = self._MAX_RESULTS
2750 return self._get_n_results(query, n)
2751
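The search URL scheme handled above accepts an empty prefix (first result only), a positive number, or 'all'. A sketch of how _make_valid_url would match a query for a hypothetical extractor whose _SEARCH_KEY is 'examplesearch':

import re

valid_url = r'examplesearch(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
m = re.match(valid_url, 'examplesearch5:cats')
assert m.group('prefix') == '5' and m.group('query') == 'cats'  # -> first 5 results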
2752 def _get_n_results(self, query, n):
2753 """Get a specified number of results for a query"""
611c1dd9 2754 raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
2755
2756 @property
2757 def SEARCH_KEY(self):
2758 return self._SEARCH_KEY