jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[BostonGlobe] New. Nonstandard version of Brightcove.
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
6a3828fd 1from __future__ import unicode_literals
f1a9d64e 2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4 8import os
773f291d 9import random
d6983cb4
PH
10import re
11import socket
12import sys
4094b6e3 13import time
1bac3455 14import math
d6983cb4 15
8c25f81b 16from ..compat import (
42939b61 17 compat_cookiejar,
799207e8 18 compat_cookies,
e9c0cdd3 19 compat_etree_fromstring,
e64b7569 20 compat_getpass,
d6983cb4 21 compat_http_client,
e9c0cdd3
YCH
22 compat_os_name,
23 compat_str,
d6983cb4 24 compat_urllib_error,
98763ee3 25 compat_urllib_parse_unquote,
15707c7e 26 compat_urllib_parse_urlencode,
41d06b04 27 compat_urllib_request,
f0b5d6af 28 compat_urlparse,
8c25f81b 29)
b22ca762 30from ..downloader.f4m import remove_encrypted_media
8c25f81b 31from ..utils import (
c342041f 32 NO_DEFAULT,
05900629 33 age_restricted,
02dc0a36 34 base_url,
08f2a92c 35 bug_reports_message,
d6983cb4
PH
36 clean_html,
37 compiled_regex_type,
70f0f5a8 38 determine_ext,
46b18f23 39 determine_protocol,
9b9c5355 40 error_to_compat_str,
d6983cb4 41 ExtractorError,
46b18f23 42 extract_attributes,
97f4aecf 43 fix_xml_ampersands,
b14f3a4c 44 float_or_none,
773f291d
S
45 GeoRestrictedError,
46 GeoUtils,
31bb8d3f 47 int_or_none,
a4a554a7 48 js_to_json,
46b18f23
JH
49 mimetype2ext,
50 orderedSet,
51 parse_codecs,
52 parse_duration,
4ca2a3cf 53 parse_iso8601,
46b18f23 54 parse_m3u8_attributes,
55b3e45b 55 RegexNotFoundError,
5c2266df 56 sanitized_Request,
46b18f23 57 sanitize_filename,
f38de77f 58 unescapeHTML,
647eab45 59 unified_strdate,
6b3a3098 60 unified_timestamp,
46b18f23
JH
61 update_Request,
62 update_url_query,
63 urljoin,
a107193e 64 url_basename,
a6571f10 65 xpath_element,
8d6765cf
S
66 xpath_text,
67 xpath_with_ns,
d6983cb4 68)
c342041f 69
d6983cb4
PH
70
71class InfoExtractor(object):
72 """Information Extractor class.
73
74 Information extractors are the classes that, given a URL, extract
75 information about the video (or videos) the URL refers to. This
76 information includes the real video URL, the video title, author and
77 others. The information is stored in a dictionary which is then
5d380852 78 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
79 information possibly downloading the video to the file system, among
80 other possible outcomes.
81
cf0649f8 82 The _type field determines the type of the result.
fed5d032
PH
83 By far the most common value (and the default if _type is missing) is
84 "video", which indicates a single video.
85
86 For a video, the dictionaries must include the following fields:
d6983cb4
PH
87
88 id: Video identifier.
d6983cb4 89 title: Video title, unescaped.
d67b0b15 90
f49d89ee 91 Additionally, it must contain either a formats entry or a url one:
d67b0b15 92
f49d89ee
PH
93 formats: A list of dictionaries for each format available, ordered
94 from worst to best quality.
95
96 Potential fields:
86f4d14f
S
97 * url Mandatory. The URL of the video file
98 * manifest_url
99 The URL of the manifest file in case of
100 fragmented media (DASH, hls, hds)
10952eb2 101 * ext Will be calculated from URL if missing
d67b0b15
PH
102 * format A human-readable description of the format
103 ("mp4 container with h264/opus").
104 Calculated from the format_id, width, height,
105 and format_note fields if missing.
106 * format_id A short description of the format
5d4f3985
PH
107 ("mp4_h264_opus" or "19").
108 Technically optional, but strongly recommended.
d67b0b15
PH
109 * format_note Additional info about the format
110 ("3D" or "DASH video")
111 * width Width of the video, if known
112 * height Height of the video, if known
f49d89ee 113 * resolution Textual description of width and height
7217e148 114 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
115 * abr Average audio bitrate in KBit/s
116 * acodec Name of the audio codec in use
dd27fd17 117 * asr Audio sampling rate in Hertz
d67b0b15 118 * vbr Average video bitrate in KBit/s
fbb21cf5 119 * fps Frame rate
d67b0b15 120 * vcodec Name of the video codec in use
1394ce65 121 * container Name of the container format
d67b0b15 122 * filesize The number of bytes, if known in advance
9732d77e 123 * filesize_approx An estimate for the number of bytes
d67b0b15 124 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
125 * protocol The protocol that will be used for the actual
126 download, lower-case.
b04b8852 127 "http", "https", "rtsp", "rtmp", "rtmpe",
af7d5a63 128 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
129 * fragment_base_url
130 Base URL for fragments. Each fragment's path
131 value (if present) will be relative to
132 this URL.
133 * fragments A list of fragments of a fragmented media.
134 Each fragment entry must contain either a url
135 or a path. If a url is present it should be
136 considered by a client. Otherwise both path and
137 fragment_base_url must be present. Here is
138 the list of all potential fields:
139 * "url" - fragment's URL
140 * "path" - fragment's path relative to
141 fragment_base_url
a0d5077c
S
142 * "duration" (optional, int or float)
143 * "filesize" (optional, int)
f49d89ee 144 * preference Order number of this format. If this field is
08d13955 145 present and not None, the formats get sorted
38d63d84 146 by this field, regardless of all other values.
f49d89ee
PH
147 -1 for default (order by other properties),
148 -2 or smaller for less than default.
e65566a9
PH
149 < -1000 to hide the format (if there is
150 another one which is strictly better)
32f90364
PH
151 * language Language code, e.g. "de" or "en-US".
152 * language_preference Is this in the language mentioned in
153 the URL?
aff2f4f4
PH
154 10 if it's what the URL is about,
155 -1 for default (don't know),
156 -10 otherwise, other values reserved for now.
5d73273f
PH
157 * quality Order number of the video quality of this
158 format, irrespective of the file format.
159 -1 for default (order by other properties),
160 -2 or smaller for less than default.
c64ed2a3
PH
161 * source_preference Order number for this video source
162 (quality takes higher priority)
163 -1 for default (order by other properties),
164 -2 or smaller for less than default.
d769be6c
PH
165 * http_headers A dictionary of additional HTTP headers
166 to add to the request.
6271f1ca 167 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
168 video's pixels are not square.
169 width : height ratio as float.
170 * no_resume The server does not support resuming the
171 (HTTP or RTMP) download. Boolean.
172
c0ba0f48 173 url: Final video URL.
d6983cb4 174 ext: Video filename extension.
d67b0b15
PH
175 format: The video format, defaults to ext (used for --get-format)
176 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 177
d6983cb4
PH
178 The following fields are optional:
179
f5e43bc6 180 alt_title: A secondary title of the video.
0afef30b
PH
181 display_id: An alternative identifier for the video, not necessarily
182 unique, but available before title. Typically, id is
183 something like "4234987", title "Dancing naked mole rats",
184 and display_id "dancing-naked-mole-rats"
d5519808 185 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 186 * "id" (optional, string) - Thumbnail format ID
d5519808 187 * "url"
cfb56d1a 188 * "preference" (optional, int) - quality of the image
d5519808
PH
189 * "width" (optional, int)
190 * "height" (optional, int)
191 * "resolution" (optional, string "{width}x{height}",
192 deprecated)
2de624fd 193 * "filesize" (optional, int)
d6983cb4 194 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 195 description: Full video description.
d6983cb4 196 uploader: Full name of the video uploader.
2bc0c46f 197 license: License name the video is licensed under.
8a92e51c 198 creator: The creator of the video.
8aab976b 199 release_date: The date (YYYYMMDD) when the video was released.
955c4514 200 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 201 upload_date: Video upload date (YYYYMMDD).
955c4514 202 If not explicitly set, calculated from timestamp.
d6983cb4 203 uploader_id: Nickname or id of the video uploader.
7bcd2830 204 uploader_url: Full URL to a personal webpage of the video uploader.
da9ec3b9 205 location: Physical location where the video was filmed.
a504ced0 206 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
207 {tag: subformats}. "tag" is usually a language code, and
208 "subformats" is a list sorted from lower to higher
209 preference, each element is a dictionary with the "ext"
210 entry and one of:
a504ced0 211 * "data": The subtitles file contents
10952eb2 212 * "url": A URL pointing to the subtitles file
4bba3716 213 "ext" will be calculated from URL if missing
360e1ca5
JMF
214 automatic_captions: Like 'subtitles', used by the YoutubeIE for
215 automatically generated captions
62d231c0 216 duration: Length of the video in seconds, as an integer or float.
f3d29461 217 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
218 like_count: Number of positive ratings of the video
219 dislike_count: Number of negative ratings of the video
02835c6b 220 repost_count: Number of reposts of the video
2d30521a 221 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 222 comment_count: Number of comments on the video
dd622d7c
PH
223 comments: A list of comments, each with one or more of the following
224 properties (all but one of text or html optional):
225 * "author" - human-readable name of the comment author
226 * "author_id" - user ID of the comment author
227 * "id" - Comment ID
228 * "html" - Comment as HTML
229 * "text" - Plain text of the comment
230 * "timestamp" - UNIX timestamp of comment
231 * "parent" - ID of the comment this one is replying to.
232 Set to "root" to indicate that this is a
233 comment to the original video.
8dbe9899 234 age_limit: Age restriction for the video, as an integer (years)
10952eb2 235 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
236 should allow getting the same result again. (It will be set
237 by YoutubeDL if it's missing)
ad3bc6ac
PH
238 categories: A list of categories that the video falls in, for example
239 ["Sports", "Berlin"]
864f24bd 240 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
241 is_live: True, False, or None (=unknown). Whether this video is a
242 live stream that goes on instead of a fixed-length video.
7c80519c 243 start_time: Time in seconds where the reproduction should start, as
10952eb2 244 specified in the URL.
297a564b 245 end_time: Time in seconds where the reproduction should end, as
10952eb2 246 specified in the URL.
d6983cb4 247
7109903e
S
248 The following fields should only be used when the video belongs to some logical
249 chapter or section:
250
251 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
252 chapter_number: Number of the chapter the video belongs to, as an integer.
253 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
254
255 The following fields should only be used when the video is an episode of some
8d76bdf1 256 series, programme or podcast:
7109903e
S
257
258 series: Title of the series or programme the video episode belongs to.
259 season: Title of the season the video episode belongs to.
27bfd4e5
S
260 season_number: Number of the season the video episode belongs to, as an integer.
261 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
262 episode: Title of the video episode. Unlike mandatory video title field,
263 this field should denote the exact title of the video episode
264 without any kind of decoration.
27bfd4e5
S
265 episode_number: Number of the video episode within a season, as an integer.
266 episode_id: Id of the video episode, as a unicode string.
7109903e 267
7a93ab5f
S
268 The following fields should only be used when the media is a track or a part of
269 a music album:
270
271 track: Title of the track.
272 track_number: Number of the track within an album or a disc, as an integer.
273 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
274 as a unicode string.
275 artist: Artist(s) of the track.
276 genre: Genre(s) of the track.
277 album: Title of the album the track belongs to.
278 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
279 album_artist: List of all artists that appeared on the album (e.g.
280 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
281 and compilations).
282 disc_number: Number of the disc or other physical medium the track belongs to,
283 as an integer.
284 release_year: Year (YYYY) when the album was released.
285
deefc05b 286 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 287
d838b1bd
PH
288 Unless mentioned otherwise, None is equivalent to absence of information.
289
fed5d032
PH
290
291 _type "playlist" indicates multiple videos.
b82f815f
PH
292 There must be a key "entries", which is a list, an iterable, or a PagedList
293 object, each element of which is a valid dictionary by this specification.
fed5d032 294
e0b9d78f
S
295 Additionally, playlists can have "title", "description" and "id" attributes
296 with the same semantics as videos (see above).
fed5d032
PH
297
298
299 _type "multi_video" indicates that there are multiple videos that
300 form a single show, for example multiple acts of an opera or TV episode.
301 It must have an entries key like a playlist and contain all the keys
302 required for a video at the same time.
303
304
305 _type "url" indicates that the video must be extracted from another
306 location, possibly by a different extractor. Its only required key is:
307 "url" - the next URL to extract.
f58766ce
PH
308 The key "ie_key" can be set to the class name (minus the trailing "IE",
309 e.g. "Youtube") if the extractor class is known in advance.
310 Additionally, the dictionary may have any properties of the resolved entity
311 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
312 known ahead of time.
313
314
315 _type "url_transparent" entities have the same specification as "url", but
316 indicate that the given additional information is more precise than the one
317 associated with the resolved URL.
318 This is useful when a site employs a video service that hosts the video and
319 its technical metadata, but that video service does not embed a useful
320 title, description etc.
321
322
d6983cb4
PH
323 Subclasses of this one should re-define the _real_initialize() and
324 _real_extract() methods and define a _VALID_URL regexp.
325 Probably, they should also be added to the list of extractors.
326
4248dad9 327 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
328 geo restriction bypass mechanisms for a particular extractor.
329 Though it won't disable explicit geo restriction bypass based on
4248dad9
S
330 country code provided with geo_bypass_country. (experimental)
331
332 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
333 countries for this extractor. One of these countries will be used by
334 geo restriction bypass mechanism right away in order to bypass
335 geo restriction, of course, if the mechanism is not disabled. (experimental)
773f291d 336
3ccdde8c
S
337 NB: both these geo attributes are experimental and may change in the future
338 or be completely removed.
339
d6983cb4
PH
340 Finally, the _WORKING attribute should be set to False for broken IEs
341 in order to warn the users and skip the tests.
342 """
343
344 _ready = False
345 _downloader = None
773f291d 346 _x_forwarded_for_ip = None
4248dad9
S
347 _GEO_BYPASS = True
348 _GEO_COUNTRIES = None
d6983cb4
PH
349 _WORKING = True
350
351 def __init__(self, downloader=None):
352 """Constructor. Receives an optional downloader."""
353 self._ready = False
773f291d 354 self._x_forwarded_for_ip = None
d6983cb4
PH
355 self.set_downloader(downloader)
356
357 @classmethod
358 def suitable(cls, url):
359 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
360
361 # This does not use has/getattr intentionally - we want to know whether
362 # we have cached the regexp for *this* class, whereas getattr would also
363 # match the superclass
364 if '_VALID_URL_RE' not in cls.__dict__:
365 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
366 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 367
ed9266db
PH
368 @classmethod
369 def _match_id(cls, url):
370 if '_VALID_URL_RE' not in cls.__dict__:
371 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
372 m = cls._VALID_URL_RE.match(url)
373 assert m
374 return m.group('id')
375
d6983cb4
PH
376 @classmethod
377 def working(cls):
378 """Getter method for _WORKING."""
379 return cls._WORKING
380
381 def initialize(self):
382 """Initializes an instance (authentication, etc)."""
e39b5d4a 383 self._initialize_geo_bypass(self._GEO_COUNTRIES)
4248dad9
S
384 if not self._ready:
385 self._real_initialize()
386 self._ready = True
387
e39b5d4a
S
388 def _initialize_geo_bypass(self, countries):
389 """
390 Initialize geo restriction bypass mechanism.
391
392 This method is used to initialize geo bypass mechanism based on faking
393 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 394 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
395 IP will be passed as X-Forwarded-For HTTP header in all subsequent
396 HTTP requests.
e39b5d4a
S
397
398 This method will be used for initial geo bypass mechanism initialization
399 during the instance initialization with _GEO_COUNTRIES.
400
401 You may also manually call it from extractor's code if geo countries
402 information is not available beforehand (e.g. obtained during
403 extraction) or for some other reason.
404 """
773f291d 405 if not self._x_forwarded_for_ip:
0a840f58 406 country_code = self._downloader.params.get('geo_bypass_country', None)
4248dad9
S
407 # If there is no explicit country for geo bypass specified and
408 # the extractor is known to be geo restricted let's fake IP
409 # as X-Forwarded-For right away.
410 if (not country_code and
411 self._GEO_BYPASS and
412 self._downloader.params.get('geo_bypass', True) and
e39b5d4a
S
413 countries):
414 country_code = random.choice(countries)
773f291d
S
415 if country_code:
416 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
417 if self._downloader.params.get('verbose', False):
418 self._downloader.to_stdout(
eea0716c
S
419 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
420 % (self._x_forwarded_for_ip, country_code.upper()))
d6983cb4
PH
421
422 def extract(self, url):
423 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 424 try:
773f291d
S
425 for _ in range(2):
426 try:
427 self.initialize()
0016b84e
S
428 ie_result = self._real_extract(url)
429 if self._x_forwarded_for_ip:
430 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
431 return ie_result
773f291d 432 except GeoRestrictedError as e:
4248dad9
S
433 if self.__maybe_fake_ip_and_retry(e.countries):
434 continue
773f291d 435 raise
3a5bcd03
PH
436 except ExtractorError:
437 raise
438 except compat_http_client.IncompleteRead as e:
dfb1b146 439 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 440 except (KeyError, StopIteration) as e:
dfb1b146 441 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 442
4248dad9
S
443 def __maybe_fake_ip_and_retry(self, countries):
444 if (not self._downloader.params.get('geo_bypass_country', None) and
445 self._GEO_BYPASS and
446 self._downloader.params.get('geo_bypass', True) and
447 not self._x_forwarded_for_ip and
448 countries):
eea0716c
S
449 country_code = random.choice(countries)
450 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
451 if self._x_forwarded_for_ip:
452 self.report_warning(
eea0716c
S
453 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
454 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
455 return True
456 return False
457
d6983cb4
PH
458 def set_downloader(self, downloader):
459 """Sets the downloader for this IE."""
460 self._downloader = downloader
461
462 def _real_initialize(self):
463 """Real initialization process. Redefine in subclasses."""
464 pass
465
466 def _real_extract(self, url):
467 """Real extraction process. Redefine in subclasses."""
468 pass
469
56c73665
JMF
470 @classmethod
471 def ie_key(cls):
472 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 473 return compat_str(cls.__name__[:-2])
56c73665 474
d6983cb4
PH
475 @property
476 def IE_NAME(self):
dc519b54 477 return compat_str(type(self).__name__[:-2])
d6983cb4 478
41d06b04 479 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
d6983cb4
PH
480 """ Returns the response handle """
481 if note is None:
482 self.report_download_webpage(video_id)
483 elif note is not False:
7cc3570e 484 if video_id is None:
f1a9d64e 485 self.to_screen('%s' % (note,))
7cc3570e 486 else:
f1a9d64e 487 self.to_screen('%s: %s' % (video_id, note))
41d06b04
S
488 if isinstance(url_or_request, compat_urllib_request.Request):
489 url_or_request = update_Request(
490 url_or_request, data=data, headers=headers, query=query)
491 else:
cdfee168 492 if query:
493 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 494 if data is not None or headers:
41d06b04 495 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 496 try:
dca08720 497 return self._downloader.urlopen(url_or_request)
d6983cb4 498 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
499 if errnote is False:
500 return False
d6983cb4 501 if errnote is None:
f1a9d64e 502 errnote = 'Unable to download webpage'
7f8b2714 503
9b9c5355 504 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
505 if fatal:
506 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
507 else:
508 self._downloader.report_warning(errmsg)
509 return False
d6983cb4 510
41d06b04 511 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
d6983cb4 512 """ Returns a tuple (page content as string, URL handle) """
b9d3e163
PH
513 # Strip hashes from the URL (#1038)
514 if isinstance(url_or_request, (compat_str, str)):
515 url_or_request = url_or_request.partition('#')[0]
516
773f291d
S
517 # Some sites check X-Forwarded-For HTTP header in order to figure out
518 # the origin of the client behind proxy. This allows bypassing geo
519 # restriction by faking this header's value to IP that belongs to some
520 # geo unrestricted country. We will do so once we encounter any
521 # geo restriction error.
522 if self._x_forwarded_for_ip:
523 if 'X-Forwarded-For' not in headers:
524 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
525
cdfee168 526 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
7cc3570e
PH
527 if urlh is False:
528 assert not fatal
529 return False
c9a77969 530 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
531 return (content, urlh)
532
c9a77969
YCH
533 @staticmethod
534 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
535 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
536 if m:
537 encoding = m.group(1)
538 else:
0d75ae2c 539 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
540 webpage_bytes[:1024])
541 if m:
542 encoding = m.group(1).decode('ascii')
b60016e8
PH
543 elif webpage_bytes.startswith(b'\xff\xfe'):
544 encoding = 'utf-16'
f143d86a
PH
545 else:
546 encoding = 'utf-8'
c9a77969
YCH
547
548 return encoding
549
550 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
551 content_type = urlh.headers.get('Content-Type', '')
552 webpage_bytes = urlh.read()
553 if prefix is not None:
554 webpage_bytes = prefix + webpage_bytes
555 if not encoding:
556 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
557 if self._downloader.params.get('dump_intermediate_pages', False):
558 try:
559 url = url_or_request.get_full_url()
560 except AttributeError:
561 url = url_or_request
f1a9d64e 562 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
563 dump = base64.b64encode(webpage_bytes).decode('ascii')
564 self._downloader.to_screen(dump)
d41e6efc
PH
565 if self._downloader.params.get('write_pages', False):
566 try:
567 url = url_or_request.get_full_url()
568 except AttributeError:
569 url = url_or_request
5afa7f8b 570 basen = '%s_%s' % (video_id, url)
c1bce22f 571 if len(basen) > 240:
f1a9d64e 572 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
573 basen = basen[:240 - len(h)] + h
574 raw_filename = basen + '.dump'
d41e6efc 575 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 576 self.to_screen('Saving request to ' + filename)
5f58165d
S
577 # Working around MAX_PATH limitation on Windows (see
578 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
e9c0cdd3 579 if compat_os_name == 'nt':
5f58165d
S
580 absfilepath = os.path.abspath(filename)
581 if len(absfilepath) > 259:
582 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
583 with open(filename, 'wb') as outf:
584 outf.write(webpage_bytes)
585
ec0fafbb
AA
586 try:
587 content = webpage_bytes.decode(encoding, 'replace')
588 except LookupError:
589 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 590
f1a9d64e
PH
591 if ('<title>Access to this site is blocked</title>' in content and
592 'Websense' in content[:512]):
593 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
594 blocked_iframe = self._html_search_regex(
595 r'<iframe src="([^"]+)"', content,
f1a9d64e 596 'Websense information URL', default=None)
2410c43d 597 if blocked_iframe:
f1a9d64e 598 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 599 raise ExtractorError(msg, expected=True)
77b2986b
PH
600 if '<title>The URL you requested has been blocked</title>' in content[:512]:
601 msg = (
602 'Access to this webpage has been blocked by Indian censorship. '
603 'Use a VPN or proxy server (with --proxy) to route around it.')
604 block_msg = self._html_search_regex(
605 r'</h1><p>(.*?)</p>',
606 content, 'block message', default=None)
607 if block_msg:
608 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
609 raise ExtractorError(msg, expected=True)
2410c43d 610
23be51d8 611 return content
d6983cb4 612
41d06b04 613 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
d6983cb4 614 """ Returns the data of the page as a string """
995ad69c
TF
615 success = False
616 try_count = 0
617 while success is False:
618 try:
cdfee168 619 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
995ad69c
TF
620 success = True
621 except compat_http_client.IncompleteRead as e:
622 try_count += 1
623 if try_count >= tries:
624 raise e
625 self._sleep(timeout, video_id)
7cc3570e
PH
626 if res is False:
627 return res
628 else:
629 content, _ = res
630 return content
d6983cb4 631
2a275ab0 632 def _download_xml(self, url_or_request, video_id,
f1a9d64e 633 note='Downloading XML', errnote='Unable to download XML',
41d06b04 634 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
267ed0c5 635 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 636 xml_string = self._download_webpage(
cdfee168 637 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
28746fbd
PH
638 if xml_string is False:
639 return xml_string
e2b38da9
PH
640 if transform_source:
641 xml_string = transform_source(xml_string)
36e6f62c 642 return compat_etree_fromstring(xml_string.encode('utf-8'))
267ed0c5 643
3d3538e4 644 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
645 note='Downloading JSON metadata',
646 errnote='Unable to download JSON metadata',
b090af59 647 transform_source=None,
41d06b04 648 fatal=True, encoding=None, data=None, headers={}, query={}):
b090af59 649 json_string = self._download_webpage(
c9a77969 650 url_or_request, video_id, note, errnote, fatal=fatal,
cdfee168 651 encoding=encoding, data=data, headers=headers, query=query)
b090af59
PH
652 if (not fatal) and json_string is False:
653 return None
ebb64199
TF
654 return self._parse_json(
655 json_string, video_id, transform_source=transform_source, fatal=fatal)
656
657 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
658 if transform_source:
659 json_string = transform_source(json_string)
3d3538e4
PH
660 try:
661 return json.loads(json_string)
662 except ValueError as ve:
e7b6d122
PH
663 errmsg = '%s: Failed to parse JSON ' % video_id
664 if fatal:
665 raise ExtractorError(errmsg, cause=ve)
666 else:
667 self.report_warning(errmsg + str(ve))
3d3538e4 668
f45f96f8 669 def report_warning(self, msg, video_id=None):
f1a9d64e 670 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 671 self._downloader.report_warning(
f1a9d64e 672 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 673
d6983cb4
PH
674 def to_screen(self, msg):
675 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 676 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
677
678 def report_extraction(self, id_or_name):
679 """Report information extraction."""
f1a9d64e 680 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
681
682 def report_download_webpage(self, video_id):
683 """Report webpage download."""
f1a9d64e 684 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
685
686 def report_age_confirmation(self):
687 """Report attempt to confirm age."""
f1a9d64e 688 self.to_screen('Confirming age')
d6983cb4 689
fc79158d
JMF
690 def report_login(self):
691 """Report attempt to log in."""
f1a9d64e 692 self.to_screen('Logging in')
fc79158d 693
43e7d3c9
S
694 @staticmethod
695 def raise_login_required(msg='This video is only available for registered users'):
696 raise ExtractorError(
697 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
698 expected=True)
699
c430802e 700 @staticmethod
773f291d
S
701 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
702 raise GeoRestrictedError(msg, countries=countries)
c430802e 703
5f6a1245 704 # Methods for following #608
c0d0b01f 705 @staticmethod
830d53bf 706 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 707 """Returns a URL that points to a page that should be processed"""
5f6a1245 708 # TODO: ie should be the class used for getting the info
d6983cb4
PH
709 video_info = {'_type': 'url',
710 'url': url,
711 'ie_key': ie}
7012b23c
PH
712 if video_id is not None:
713 video_info['id'] = video_id
830d53bf
S
714 if video_title is not None:
715 video_info['title'] = video_title
d6983cb4 716 return video_info
5f6a1245 717
46b18f23
JH
def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
    """
    Build a playlist result out of matches.
    Each match (or getter(match) when getter is given) is passed through
    _proto_relative_url and wrapped with url_result for ie; the entries are
    then de-duplicated with orderedSet.
    """
    def _entry(match):
        raw_url = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(raw_url), ie)

    entries = orderedSet(_entry(m) for m in matches)
    return self.playlist_result(
        entries, playlist_id=video_id, playlist_title=video_title)
724
c0d0b01f 725 @staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """
    Returns a playlist.
    The result is a dict of type 'playlist' holding entries; 'id', 'title'
    and 'description' are only included when the matching argument is truthy.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
737
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern may be a string, a compiled regex or an iterable of those; for
    an iterable the patterns are tried in order and the first match wins.
    When group is None the first non-None group of the match is returned,
    otherwise mobj.group(group).
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty collection of patterns takes
        # the regular failure path below instead of leaving mobj unbound
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in messages when writing to a colour capable terminal
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        return mobj.group(group)

    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
771
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    Falsy results (None, empty string, a falsy default) are returned as is.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        return res
    return clean_html(res).strip()
781
2118fdd1
RA
782 def _get_netrc_login_info(self, netrc_machine=None):
783 username = None
784 password = None
785 netrc_machine = netrc_machine or self._NETRC_MACHINE
786
787 if self._downloader.params.get('usenetrc', False):
788 try:
789 info = netrc.netrc().authenticators(netrc_machine)
790 if info is not None:
791 username = info[0]
792 password = info[2]
793 else:
dcce092e
S
794 raise netrc.NetrcParseError(
795 'No authenticators for %s' % netrc_machine)
2118fdd1 796 except (IOError, netrc.NetrcParseError) as err:
dcce092e
S
797 self._downloader.report_warning(
798 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 799
dcce092e 800 return username, password
2118fdd1 801
1b6712ab 802 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 803 """
cf0649f8 804 Get the login info as (username, password)
32443dd3
S
805 First look for the manually specified credentials using username_option
806 and password_option as keys in params dictionary. If no such credentials
807 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
808 value.
fc79158d
JMF
809 If there's no info available, return (None, None)
810 """
811 if self._downloader is None:
812 return (None, None)
813
fc79158d
JMF
814 downloader_params = self._downloader.params
815
816 # Attempt to use provided username and password or .netrc data
1b6712ab
RA
817 if downloader_params.get(username_option) is not None:
818 username = downloader_params[username_option]
819 password = downloader_params[password_option]
2118fdd1 820 else:
1b6712ab 821 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 822
2133565c 823 return username, password
fc79158d 824
e64b7569 825 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 826 """
827 Get the two-factor authentication info
828 TODO - asking the user will be required for sms/phone verify
829 currently just uses the command line option
830 If there's no info available, return None
831 """
832 if self._downloader is None:
83317f69 833 return None
834 downloader_params = self._downloader.params
835
d800609c 836 if downloader_params.get('twofactor') is not None:
83317f69 837 return downloader_params['twofactor']
838
e64b7569 839 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 840
46720279
JMF
841 # Helper functions for extracting OpenGraph info
842 @staticmethod
ab2d5247 843 def _og_regexes(prop):
448ef1f3 844 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
845 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
846 % {'prop': re.escape(prop)})
78fb87b2 847 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 848 return [
78fb87b2
JMF
849 template % (property_re, content_re),
850 template % (content_re, property_re),
ab2d5247 851 ]
46720279 852
864f24bd
S
853 @staticmethod
854 def _meta_regex(prop):
855 return r'''(?isx)<meta
8b9848ac 856 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
857 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
858
3c4e6d83 859 def _og_search_property(self, prop, html, name=None, **kargs):
b070564e
S
860 if not isinstance(prop, (list, tuple)):
861 prop = [prop]
46720279 862 if name is None:
b070564e
S
863 name = 'OpenGraph %s' % prop[0]
864 og_regexes = []
865 for p in prop:
866 og_regexes.extend(self._og_regexes(p))
867 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
868 if escaped is None:
869 return None
870 return unescapeHTML(escaped)
46720279
JMF
871
872 def _og_search_thumbnail(self, html, **kargs):
10952eb2 873 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
874
875 def _og_search_description(self, html, **kargs):
876 return self._og_search_property('description', html, fatal=False, **kargs)
877
878 def _og_search_title(self, html, **kargs):
879 return self._og_search_property('title', html, **kargs)
880
8ffa13e0 881 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
882 regexes = self._og_regexes('video') + self._og_regexes('video:url')
883 if secure:
884 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 885 return self._html_search_regex(regexes, html, name, **kargs)
46720279 886
78338f71
JMF
887 def _og_search_url(self, html, **kargs):
888 return self._og_search_property('url', html, **kargs)
889
40c696e5 890 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
88d9f6c0
S
891 if not isinstance(name, (list, tuple)):
892 name = [name]
59040888 893 if display_name is None:
88d9f6c0 894 display_name = name[0]
59040888 895 return self._html_search_regex(
88d9f6c0 896 [self._meta_regex(n) for n in name],
711ede6e 897 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
898
899 def _dc_search_uploader(self, html):
900 return self._html_search_meta('dc.creator', html, 'uploader')
901
8dbe9899
PH
902 def _rta_search(self, html):
903 # See http://www.rtalabel.org/index.php?content=howtofaq#single
904 if re.search(r'(?ix)<meta\s+name="rating"\s+'
905 r' content="RTA-5042-1996-1400-1577-RTA"',
906 html):
907 return 18
908 return 0
909
59040888
PH
910 def _media_rating_search(self, html):
911 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
912 rating = self._html_search_meta('rating', html)
913
914 if not rating:
915 return None
916
917 RATING_TABLE = {
918 'safe for kids': 0,
919 'general': 8,
920 '14 years': 14,
921 'mature': 17,
922 'restricted': 19,
923 }
d800609c 924 return RATING_TABLE.get(rating.lower())
59040888 925
69319969 926 def _family_friendly_search(self, html):
6ca7732d 927 # See http://schema.org/VideoObject
69319969
NJ
928 family_friendly = self._html_search_meta('isFamilyFriendly', html)
929
930 if not family_friendly:
931 return None
932
933 RATING_TABLE = {
934 '1': 0,
935 'true': 0,
936 '0': 18,
937 'false': 18,
938 }
d800609c 939 return RATING_TABLE.get(family_friendly.lower())
69319969 940
0c708f11
JMF
941 def _twitter_search_player(self, html):
942 return self._html_search_meta('twitter:player', html,
9e1a5b84 943 'twitter card player')
0c708f11 944
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """
    Find the first application/ld+json <script> in html and turn it into an
    info dict through _json_ld. Keyword arguments (e.g. default, fatal) are
    passed to _search_regex. When nothing is found the given default is
    returned, or an empty dict if no default was passed.
    """
    json_ld = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        if default is not NO_DEFAULT:
            return default
        return {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    if default == NO_DEFAULT:
        fatal = kwargs.get('fatal', True)
    else:
        fatal = False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
4ca2a3cf 957
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Extract info dict fields from JSON-LD data.

    json_ld is either a JSON string (parsed with _parse_json, honouring
    fatal) or already parsed data: a dict or a list/tuple of dicts. Only
    the first entry whose '@context' is 'http://schema.org' is considered;
    TVEpisode, Article and VideoObject types are understood. When
    expected_type is given and that entry has a different '@type', an
    empty dict is returned.
    Returns a dict with the fields that could be extracted (None values
    are dropped); anything unusable results in an empty dict.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    if isinstance(json_ld, dict):
        json_ld = [json_ld]
    for e in json_ld:
        # Lists found in the wild may hold non-object items (strings,
        # numbers, nested lists); those cannot describe an entry
        if not isinstance(e, dict):
            continue
        if e.get('@context') == 'http://schema.org':
            item_type = e.get('@type')
            if expected_type is not None and expected_type != item_type:
                return info
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(e.get('name')),
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                part_of_season = e.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody')),
                })
            elif item_type == 'VideoObject':
                info.update({
                    'url': e.get('contentUrl'),
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('uploadDate')),
                    'filesize': float_or_none(e.get('contentSize')),
                    'tbr': int_or_none(e.get('bitrate')),
                    'width': int_or_none(e.get('width')),
                    'height': int_or_none(e.get('height')),
                })
            # Only the first schema.org entry is used
            break
    return dict((k, v) for k, v in info.items() if v is not None)
1006
27713812 1007 @staticmethod
def _hidden_inputs(html):
    """
    Collect the name -> value pairs of the <input> tags of type 'hidden'
    or 'submit' found in html. HTML comments are removed first. The 'id'
    attribute is used when an input has no 'name'; inputs lacking a
    name/id or a value attribute are skipped.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # The matched tag text is never empty, so it is the parsed
        # attributes that need checking here
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
27713812 1022
cf61d96d
S
1023 def _form_hidden_inputs(self, form_id, html):
1024 form = self._search_regex(
73eb13df 1025 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1026 html, '%s form' % form_id, group='form')
1027 return self._hidden_inputs(form)
1028
3ded7bac 1029 def _sort_formats(self, formats, field_preference=None):
7e8caf30 1030 if not formats:
f1a9d64e 1031 raise ExtractorError('No video formats found')
7e8caf30 1032
b0d21ded
S
1033 for f in formats:
1034 # Automatically determine tbr when missing based on abr and vbr (improves
1035 # formats sorting in some cases)
350cf045 1036 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
b0d21ded
S
1037 f['tbr'] = f['abr'] + f['vbr']
1038
4bcc7bd1 1039 def _formats_key(f):
e6812ac9
PH
1040 # TODO remove the following workaround
1041 from ..utils import determine_ext
1042 if not f.get('ext') and 'url' in f:
1043 f['ext'] = determine_ext(f['url'])
1044
3ded7bac 1045 if isinstance(field_preference, (list, tuple)):
bf8dd790
S
1046 return tuple(
1047 f.get(field)
1048 if f.get(field) is not None
1049 else ('' if field == 'format_id' else -1)
1050 for field in field_preference)
3ded7bac 1051
4bcc7bd1
PH
1052 preference = f.get('preference')
1053 if preference is None:
d497a201 1054 preference = 0
4bcc7bd1
PH
1055 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1056 preference -= 0.5
1057
8b408545
RA
1058 protocol = f.get('protocol') or determine_protocol(f)
1059 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
d497a201 1060
4bcc7bd1 1061 if f.get('vcodec') == 'none': # audio only
dd867805 1062 preference -= 50
4bcc7bd1 1063 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 1064 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 1065 else:
f1a9d64e 1066 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
1067 ext_preference = 0
1068 try:
1069 audio_ext_preference = ORDER.index(f['ext'])
1070 except ValueError:
1071 audio_ext_preference = -1
1072 else:
dd867805 1073 if f.get('acodec') == 'none': # video only
1074 preference -= 40
4bcc7bd1 1075 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 1076 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 1077 else:
f1a9d64e 1078 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
1079 try:
1080 ext_preference = ORDER.index(f['ext'])
1081 except ValueError:
1082 ext_preference = -1
1083 audio_ext_preference = 0
1084
1085 return (
1086 preference,
aff2f4f4 1087 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 1088 f.get('quality') if f.get('quality') is not None else -1,
9933b574 1089 f.get('tbr') if f.get('tbr') is not None else -1,
03cd72b0 1090 f.get('filesize') if f.get('filesize') is not None else -1,
4bcc7bd1 1091 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
1092 f.get('height') if f.get('height') is not None else -1,
1093 f.get('width') if f.get('width') is not None else -1,
d497a201 1094 proto_preference,
1e1896f2 1095 ext_preference,
4bcc7bd1
PH
1096 f.get('abr') if f.get('abr') is not None else -1,
1097 audio_ext_preference,
2c8e03d9 1098 f.get('fps') if f.get('fps') is not None else -1,
9732d77e 1099 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 1100 f.get('source_preference') if f.get('source_preference') is not None else -1,
74f72824 1101 f.get('format_id') if f.get('format_id') is not None else '',
4bcc7bd1
PH
1102 )
1103 formats.sort(key=_formats_key)
59040888 1104
96a53167
S
1105 def _check_formats(self, formats, video_id):
1106 if formats:
1107 formats[:] = filter(
1108 lambda f: self._is_valid_url(
1109 f['url'], video_id,
1110 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1111 formats)
1112
f5bdb444
S
1113 @staticmethod
1114 def _remove_duplicate_formats(formats):
1115 format_urls = set()
1116 unique_formats = []
1117 for f in formats:
1118 if f['url'] not in format_urls:
1119 format_urls.add(f['url'])
1120 unique_formats.append(f)
1121 formats[:] = unique_formats
1122
45024183 1123 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1124 url = self._proto_relative_url(url, scheme='http:')
1125 # For now assume non HTTP(S) URLs always valid
1126 if not (url.startswith('http://') or url.startswith('https://')):
1127 return True
96a53167 1128 try:
45024183 1129 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167
S
1130 return True
1131 except ExtractorError as e:
943a1e24 1132 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
1133 self.to_screen(
1134 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
1135 return False
1136 raise
1137
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
1144
57c7411f
PH
1145 def _proto_relative_url(self, url, scheme=None):
1146 if url is None:
1147 return url
1148 if url.startswith('//'):
1149 if scheme is None:
1150 scheme = self.http_scheme()
1151 return scheme + url
1152 else:
1153 return url
1154
4094b6e3
PH
1155 def _sleep(self, timeout, video_id, msg_template=None):
1156 if msg_template is None:
f1a9d64e 1157 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1158 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1159 self.to_screen(msg)
1160 time.sleep(timeout)
1161
a38436e8 1162 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
4de61310 1163 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1164 fatal=True, m3u8_id=None):
f036a632
JMF
1165 manifest = self._download_xml(
1166 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1167 'Unable to download f4m manifest',
1168 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1169 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
4de61310
S
1170 transform_source=transform_source,
1171 fatal=fatal)
1172
1173 if manifest is False:
8d29e47f 1174 return []
31bb8d3f 1175
0fdbb332
S
1176 return self._parse_f4m_formats(
1177 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
448bb5f3 1178 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332
S
1179
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """
    Build the list of formats described by an already parsed f4m manifest.

    manifest is the XML root element and manifest_url the URL it came from
    (used to resolve relative media URLs when the manifest has no baseURL).
    Media entries that point to further f4m or m3u8 manifests are extracted
    recursively through _extract_f4m_formats / _extract_m3u8_formats.
    Returns an empty list for manifests protected with an Akamai player
    verification challenge or without usable media nodes.
    """
    # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # The text of an empty <pv-2.0/> element is None and must not be searched
    if akamai_pv is not None and akamai_pv.text and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    # Named manifest_base_url so that the imported base_url helper is not shadowed
    manifest_base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if manifest_base_url:
        manifest_base_url = manifest_base_url.strip()

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'mime type', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
        # Kept separate from the manifest_url argument: every relative media
        # URL has to be resolved against the manifest's own location, not
        # against the URL computed for the previous media entry
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
        })
    return formats
1279
16da9bbc
YCH
1280 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1281 return {
f207019c 1282 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1283 'url': m3u8_url,
1284 'ext': ext,
1285 'protocol': 'm3u8',
37768f92 1286 'preference': preference - 100 if preference else -100,
704df56d
PH
1287 'resolution': 'multiple',
1288 'format_note': 'Quality selection URL',
16da9bbc
YCH
1289 }
1290
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True, live=False):
    """
    Download the m3u8 playlist at m3u8_url and return its formats.

    For a master playlist the result starts with the meta format (see
    _m3u8_meta_format) followed by one format per EXT-X-MEDIA rendition
    that has a URI and one per variant stream. A media playlist is
    returned as a single format. An empty list is returned when the
    download yields False or the playlist is Adobe Flash Access protected.
    With live set, bitrate/name is not appended to format ids.
    """
    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    # Continue with the final URL as reported by the response handle
    m3u8_url = urlh.geturl()

    if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        return []

    formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

    def format_url(u):
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    # We should try extracting formats only from master playlists [1], i.e.
    # playlists that describe available qualities. On the other hand media
    # playlists [2] should be returned as is since they contain just the media
    # without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 2] master
    # playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
    # and MUST NOT appear in master playlist thus we can clearly detect media
    # playlist with this criterion.
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
    # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]

    # GROUP-ID -> False when the group's audio comes as separate renditions,
    # True when an AUDIO rendition without URI was seen for the group
    audio_in_video_stream = {}
    last_info = {}
    last_media = {}
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_m3u8_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            media = parse_m3u8_attributes(line)
            media_type = media.get('TYPE')
            if media_type in ('VIDEO', 'AUDIO'):
                group_id = media.get('GROUP-ID')
                media_url = media.get('URI')
                if media_url:
                    format_id = []
                    for v in (group_id, media.get('NAME')):
                        if v:
                            format_id.append(v)
                    f = {
                        'format_id': '-'.join(format_id),
                        'url': format_url(media_url),
                        'language': media.get('LANGUAGE'),
                        'ext': ext,
                        'protocol': entry_protocol,
                        'preference': preference,
                    }
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                        if group_id and not audio_in_video_stream.get(group_id):
                            audio_in_video_stream[group_id] = False
                    formats.append(f)
                else:
                    # When there is no URI in EXT-X-MEDIA let this tag's
                    # data be used by regular URI lines below
                    last_media = media
                    if media_type == 'AUDIO' and group_id:
                        audio_in_video_stream[group_id] = True
        elif line.startswith('#') or not line.strip():
            continue
        else:
            tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF it still sometimes may be present
            stream_name = last_info.get('NAME') or last_media.get('NAME')
            # Bandwidth of live streams may differ over time thus making
            # format_id unpredictable. So it's better to keep provided
            # format_id intact.
            if not live:
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
            manifest_url = format_url(line.strip())
            f = {
                'format_id': '-'.join(format_id),
                'url': manifest_url,
                'manifest_url': manifest_url,
                'tbr': tbr,
                'ext': ext,
                'fps': float_or_none(last_info.get('FRAME-RATE')),
                'protocol': entry_protocol,
                'preference': preference,
            }
            resolution = last_info.get('RESOLUTION')
            if resolution:
                mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                if mobj:
                    f['width'] = int(mobj.group('width'))
                    f['height'] = int(mobj.group('height'))
            # Unified Streaming Platform
            mobj = re.search(
                r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
            if mobj:
                abr, vbr = mobj.groups()
                abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                f.update({
                    'vbr': vbr,
                    'abr': abr,
                })
            f.update(parse_codecs(last_info.get('CODECS')))
            # 'vcodec' is only present when the codec parse above supplied it,
            # hence .get(): a stream without that key must not raise KeyError
            if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f.get('vcodec') != 'none':
                # TODO: update acodec for audio only formats with the same GROUP-ID
                f['acodec'] = 'none'
            formats.append(f)
            last_info = {}
            last_media = {}
    return formats
1423
a107193e
S
1424 @staticmethod
1425 def _xpath_ns(path, namespace=None):
1426 if not namespace:
1427 return path
1428 out = []
1429 for c in path.split('/'):
1430 if not c or c == '.':
1431 out.append(c)
1432 else:
1433 out.append('{%s}%s' % (namespace, c))
1434 return '/'.join(out)
1435
09f572fb 1436 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1437 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 1438
995029a1
PH
1439 if smil is False:
1440 assert not fatal
1441 return []
e89a2aab 1442
17712eeb 1443 namespace = self._parse_smil_namespace(smil)
a107193e
S
1444
1445 return self._parse_smil_formats(
1446 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1447
1448 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1449 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1450 if smil is False:
1451 return {}
1452 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1453
09f572fb 1454 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
1455 return self._download_xml(
1456 smil_url, video_id, 'Downloading SMIL file',
09f572fb 1457 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
1458
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats and
    _parse_smil_subtitles. Title, description and upload date are taken
    from the first matching <meta> of the head, thumbnails from the
    <image> elements that have a src. The returned 'id' is the basename of
    smil_url without its extension (the video_id argument is only used
    for the format extraction); the title falls back to that id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        meta_name = meta.attrib.get('name')
        meta_content = meta.attrib.get('content')
        if not (meta_name and meta_content):
            continue
        if not title and meta_name == 'title':
            title = meta_content
        elif not description and meta_name in ('description', 'abstract'):
            description = meta_content
        elif not upload_date and meta_name == 'date':
            upload_date = unified_strdate(meta_content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1498
17712eeb
S
1499 def _parse_smil_namespace(self, smil):
1500 return self._search_regex(
1501 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1502
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Collect format dicts from the video and audio elements of a SMIL document.

    smil -- parsed root element of the SMIL document
    smil_url -- URL the document came from; used as the base for relative
        sources unless a ./head/meta element supplies 'base' or 'httpBase'
    video_id -- forwarded to the manifest extractors and the URL check
    namespace -- XML namespace applied to every xpath lookup
    f4m_params -- query parameters appended to f4m manifest URLs; a
        built-in hdcore/plugin pair is used when this is empty
    transform_rtmp_url -- optional callable taking (streamer, play_path)
        and returning the pair to store on RTMP formats

    Each distinct src is handled once and dispatched by protocol or
    extension: RTMP, HLS (m3u8), HDS (f4m), or plain HTTP.  Returns the
    list of formats, unsorted.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # Only a single-variant playlist can safely inherit the
            # attributes declared on the SMIL element.
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse_urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved absolute URL: the raw src may be relative
        # (or carry surrounding whitespace) and cannot be requested as is.
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

    return formats
1596
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Gather subtitle tracks from the textstream elements of a SMIL document.

    Returns a dict mapping a language to a list of {'url', 'ext'} dicts.
    Each distinct src is listed once.  The language comes from the
    systemLanguage, systemLanguageName or lang attribute, falling back to
    subtitles_lang.
    """
    seen_sources = []
    subtitles = {}
    for stream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        source = stream.get('src')
        if not source or source in seen_sources:
            continue
        seen_sources.append(source)

        extension = stream.get('ext')
        if not extension:
            extension = mimetype2ext(stream.get('type')) or determine_ext(source)

        language = subtitles_lang
        for attribute in ('systemLanguage', 'systemLanguageName', 'lang'):
            value = stream.get(attribute)
            if value:
                language = value
                break

        track = {
            'url': source,
            'ext': extension,
        }
        subtitles.setdefault(language, []).append(track)
    return subtitles
63757032 1612
942acef5
S
1613 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1614 xspf = self._download_xml(
8d6765cf 1615 playlist_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
1616 'Unable to download xspf manifest', fatal=fatal)
1617 if xspf is False:
1618 return []
1619 return self._parse_xspf(xspf, playlist_id)
8d6765cf 1620
def _parse_xspf(self, playlist, playlist_id):
    """Turn the tracks of a parsed XSPF playlist into a list of entry dicts.

    Every entry uses playlist_id as its id.  The title falls back to
    playlist_id, and the duration text is divided by 1000 via
    float_or_none.  Each location element of a track becomes one format,
    and the formats of each track are passed through _sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def qualified(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(qualified('./xspf:trackList/xspf:track')):
        title = xpath_text(
            track, qualified('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(
            track, qualified('./xspf:annotation'), 'description')
        thumbnail = xpath_text(
            track, qualified('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(
            track, qualified('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(qualified('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(qualified('s1:label')),
                'width': int_or_none(location.get(qualified('s1:width'))),
                'height': int_or_none(location.get(qualified('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1655
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """Download an MPD manifest and parse its formats.

    Returns an empty list when the download helper returns False.  The
    base URL handed to the parser is derived from the final URL reported
    by the response handle rather than from mpd_url.
    """
    if not note:
        note = 'Downloading MPD manifest'
    if not errnote:
        errnote = 'Failed to download MPD manifest'

    response = self._download_webpage_handle(
        mpd_url, video_id, note=note, errnote=errnote, fatal=fatal)
    if response is False:
        return []

    manifest_text, url_handle = response
    mpd_base_url = base_url(url_handle.geturl())
    manifest_doc = compat_etree_fromstring(manifest_text.encode('utf-8'))

    return self._parse_mpd_formats(
        manifest_doc, mpd_id, mpd_base_url,
        formats_dict=formats_dict, mpd_url=mpd_url)
2d2fa82d 1670
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
    """
    Parse formats from MPD manifest.

    mpd_doc is the parsed root element of the manifest.  Returns a list of
    format dicts, one per audio/video Representation.  A manifest whose
    @type is 'dynamic' yields an empty list, and any AdaptationSet or
    Representation with a ContentProtection child is skipped.  Entries of
    formats_dict (looked up by Representation @id) serve as a base that
    the parsed values are layered on; formats_dict itself is only read.

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Segment information is inherited: start from a copy of the
        # parent level's info and override it with what this element says.
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = int(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the
                # enclosing AdaptationSet.
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = mime_type.split('/')[0]
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    pass
                elif content_type == 'video' or content_type == 'audio':
                    # Assemble the base URL from the innermost BaseURL
                    # outwards, stopping once it becomes absolute.
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    f = {
                        'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                        'url': base_url,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': int_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                    }
                    f.update(parse_codecs(representation_attrib.get('codecs')))
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        # Convert a DASH URL template into a Python
                        # %-format string keyed by identifier name.
                        t = representation_ms_info[template_name]
                        t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        # str.replace returns a new string; the result has
                        # to be kept for the $$ escape to be unescaped.
                        t = t.replace('$$', '$')
                        return t

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/rg3/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            # Derive the segment count from the period length
                            # only when a segment duration was actually
                            # declared (the key must be tested for
                            # membership, not merely named).
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                'url': media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                # Reads the current segment_time, segment_d
                                # and segment_number of the enclosing scope.
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    'url': segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for s in representation_ms_info['s']:
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for _ in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for _ in range(s.get('r', 0) + 1):
                                fragments.append({
                                    'url': representation_ms_info['segment_urls'][segment_index],
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    # NB: MPD manifest may contain direct URLs to unfragmented media.
                    # No fragments key is present in this case.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            'fragments': [],
                            'protocol': 'http_dash_segments',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({'url': initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                        for fragment in f['fragments']:
                            fragment['url'] = urljoin(base_url, fragment['url'])
                    # Merge into an already collected format with the same
                    # id, otherwise start from the formats_dict entry.
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == representation_id)
                    except StopIteration:
                        full_info = formats_dict.get(representation_id, {}).copy()
                        full_info.update(f)
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    return formats
1921
b2758123
RA
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
    """Download an ISM manifest and parse its formats.

    Returns an empty list when the download helper returns False.  The
    final URL reported by the response handle, not ism_url, is passed on
    to the parser as the manifest URL.
    """
    if not note:
        note = 'Downloading ISM manifest'
    if not errnote:
        errnote = 'Failed to download ISM manifest'

    response = self._download_webpage_handle(
        ism_url, video_id, note=note, errnote=errnote, fatal=fatal)
    if response is False:
        return []

    manifest_text, url_handle = response
    manifest_doc = compat_etree_fromstring(manifest_text.encode('utf-8'))
    return self._parse_ism_formats(manifest_doc, url_handle.geturl(), ism_id)
1934
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
    """Parse formats from a parsed ISM manifest document.

    ism_doc -- root element of the manifest
    ism_url -- manifest URL; track URL patterns are resolved against it
    ism_id -- optional prefix for the generated format ids

    Returns one format per QualityLevel of every video/audio StreamIndex.
    An empty list is returned when IsLive is 'TRUE' or a Protection
    element is present.  Tracks whose FourCC is not H264, AVC1 or AACL
    are reported with a warning and skipped.
    """
    if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
        return []

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        # The fragment list belongs to the stream, not to a single track.
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC')
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            width = int_or_none(track.get('MaxWidth'))
            height = int_or_none(track.get('MaxHeight'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time
                    # of the NEXT fragment in the list (or the end of the
                    # presentation for the last one).  The lookup must
                    # index the fragment list, not the current element.
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except IndexError:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            format_id = []
            if ism_id:
                format_id.append(ism_id)
            if stream_name:
                format_id.append(stream_name)
            format_id.append(compat_str(tbr))

            formats.append({
                'format_id': '-'.join(format_id),
                'url': ism_url,
                'manifest_url': ism_url,
                'ext': 'ismv' if stream_type == 'video' else 'isma',
                'width': width,
                'height': height,
                'tbr': tbr,
                'asr': sampling_rate,
                'vcodec': 'none' if stream_type == 'audio' else fourcc,
                'acodec': 'none' if stream_type == 'video' else fourcc,
                'protocol': 'ism',
                'fragments': fragments,
                '_download_params': {
                    'duration': duration,
                    'timescale': stream_timescale,
                    'width': width or 0,
                    'height': height or 0,
                    'fourcc': fourcc,
                    'codec_private_data': track.get('CodecPrivateData'),
                    'sampling_rate': sampling_rate,
                    'channels': int_or_none(track.get('Channels', 2)),
                    'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                    'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                },
            })
    return formats
2020
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
    """Find HTML5 video/audio tags in webpage and describe each as an entry.

    Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'
    keys, one per media tag that produced at least one format or subtitle.
    Sources come from the tag's own src attribute and from nested source
    tags; subtitles come from nested track tags whose kind is missing,
    'subtitles' or 'captions'.  Relative URLs are resolved against
    base_url.
    """
    def absolute_url(video_url):
        return compat_urlparse.urljoin(base_url, video_url)

    def parse_content_type(content_type):
        # Split a source 'type' attribute into codec info and extension.
        if content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                info = parse_codecs(codecs)
                info['ext'] = mimetype2ext(mimetype)
                return info
        return {}

    def _media_formats(src, cur_media_type):
        # Returns (is_plain_url, formats); manifests are expanded, anything
        # else is treated as a direct media URL.
        full_url = absolute_url(src)
        ext = determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
        }]

    # Self-closing tags carry no inner content.
    media_tags = []
    for tag_html, tag_name in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage):
        media_tags.append((tag_html, tag_name, ''))
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/rg3/youtube-dl/issues/11979, example URL:
        # http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))

    entries = []
    for media_tag, media_type, media_content in media_tags:
        entry_formats = []
        entry_subtitles = {}
        media_info = {
            'formats': entry_formats,
            'subtitles': entry_subtitles,
        }
        media_attributes = extract_attributes(media_tag)
        tag_src = media_attributes.get('src')
        if tag_src:
            entry_formats.extend(_media_formats(tag_src, media_type)[1])
        media_info['thumbnail'] = media_attributes.get('poster')

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                source_src = source_attributes.get('src')
                if not source_src:
                    continue
                is_plain_url, source_formats = _media_formats(source_src, media_type)
                if not is_plain_url:
                    entry_formats.extend(source_formats)
                    continue
                # Values of the plain URL format take precedence over what
                # the type attribute declares.
                plain_format = parse_content_type(source_attributes.get('type'))
                plain_format.update(source_formats[0])
                entry_formats.append(plain_format)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                track_src = track_attributes.get('src')
                if not track_src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                entry_subtitles.setdefault(lang, []).append({
                    'url': absolute_url(track_src),
                })

        if entry_formats or entry_subtitles:
            entries.append(media_info)
    return entries
2105
c4251b9a 2106 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
c7c43a93 2107 formats = []
e71a4509 2108 hdcore_sign = 'hdcore=3.7.0'
c4251b9a
RA
2109 f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2110 hds_host = hosts.get('hds')
2111 if hds_host:
2112 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
2113 if 'hdcore=' not in f4m_url:
2114 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2115 f4m_formats = self._extract_f4m_formats(
2116 f4m_url, video_id, f4m_id='hds', fatal=False)
2117 for entry in f4m_formats:
2118 entry.update({'extra_param_to_segment_url': hdcore_sign})
2119 formats.extend(f4m_formats)
c4251b9a
RA
2120 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2121 hls_host = hosts.get('hls')
2122 if hls_host:
2123 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
c7c43a93
RA
2124 formats.extend(self._extract_m3u8_formats(
2125 m3u8_url, video_id, 'mp4', 'm3u8_native',
2126 m3u8_id='hls', fatal=False))
2127 return formats
2128
6ad02195
RA
2129 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2130 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2131 url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2132 http_base_url = 'http' + url_base
2133 formats = []
2134 if 'm3u8' not in skip_protocols:
2135 formats.extend(self._extract_m3u8_formats(
2136 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2137 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2138 if 'f4m' not in skip_protocols:
2139 formats.extend(self._extract_f4m_formats(
2140 http_base_url + '/manifest.f4m',
2141 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2142 if 'dash' not in skip_protocols:
2143 formats.extend(self._extract_mpd_formats(
2144 http_base_url + '/manifest.mpd',
2145 video_id, mpd_id='dash', fatal=False))
6ad02195 2146 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2147 if 'smil' not in skip_protocols:
2148 rtmp_formats = self._extract_smil_formats(
2149 http_base_url + '/jwplayer.smil',
2150 video_id, fatal=False)
2151 for rtmp_format in rtmp_formats:
2152 rtsp_format = rtmp_format.copy()
2153 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2154 del rtsp_format['play_path']
2155 del rtsp_format['ext']
2156 rtsp_format.update({
2157 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2158 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2159 'protocol': 'rtsp',
2160 })
2161 formats.extend([rtmp_format, rtsp_format])
2162 else:
2163 for protocol in ('rtmp', 'rtsp'):
2164 if protocol not in skip_protocols:
2165 formats.append({
2166 'url': protocol + url_base,
2167 'format_id': protocol,
2168 'protocol': protocol,
2169 })
2170 return formats
2171
a4a554a7
YCH
2172 @staticmethod
2173 def _find_jwplayer_data(webpage):
2174 mobj = re.search(
2175 r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2176 webpage)
2177 if mobj:
2178 return mobj.group('options')
2179
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Locate the JWPlayer setup options in webpage and parse them.

    The options text is converted with js_to_json and decoded by
    _parse_json; the resulting data plus any extra arguments are handed
    to _parse_jwplayer_data, whose result is returned.
    """
    raw_options = self._find_jwplayer_data(webpage)
    jwplayer_data = self._parse_json(
        raw_options, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        jwplayer_data, video_id, *args, **kwargs)
2186
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert decoded JWPlayer setup data into an info dict or playlist.

    Returns the single entry when the playlist has exactly one item,
    otherwise the result of playlist_result for all entries.  With
    require_title a missing 'title' key raises KeyError.  Note that the
    'playlist' and 'sources' keys may be filled in on the passed-in dicts.
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    entries = []
    for item in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in item:
            item['sources'] = [item]

        this_video_id = video_id or item['mediaid']

        formats = self._parse_jwplayer_formats(
            item['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
        self._sort_formats(formats)

        subtitles = {}
        tracks = item.get('tracks')
        if tracks and isinstance(tracks, list):
            caption_tracks = (
                track for track in tracks if track.get('kind') == 'captions')
            for track in caption_tracks:
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                language = track.get('label') or 'en'
                subtitles.setdefault(language, []).append({
                    'url': self._proto_relative_url(track_url)
                })

        if require_title:
            title = item['title']
        else:
            title = item.get('title')

        entries.append({
            'id': this_video_id,
            'title': title,
            'description': item.get('description'),
            'thumbnail': self._proto_relative_url(item.get('image')),
            'timestamp': int_or_none(item.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or item.get('duration')),
            'subtitles': subtitles,
            'formats': formats,
        })

    if len(entries) != 1:
        return self.playlist_result(entries)
    return entries[0]
2241
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Translate a JWPlayer 'sources' list into format dicts.

    Each source is dispatched by its declared type or the extension of its
    URL: HLS, DASH and SMIL manifests are expanded non-fatally, audio
    sources become audio-only formats, and everything else becomes a
    plain (possibly RTMP) format.  rtmp_params, when given, is merged into
    RTMP formats.
    """
    formats = []
    for source in jwplayer_sources_data:
        source_url = self._proto_relative_url(source['file'])
        if base_url:
            source_url = compat_urlparse.urljoin(base_url, source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)

        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            label = compat_str(source.get('label') or '')
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', label,
                'height', default=None))
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, 1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format['url'] = rtmp_url
                a_format['play_path'] = prefix + play_path
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats
2300
f4b1c7ad
PH
2301 def _live_title(self, name):
2302 """ Generate the title for a live video """
2303 now = datetime.datetime.now()
611c1dd9 2304 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
2305 return name + ' ' + now_str
2306
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """
    Convert v with int_or_none and report a failure.

    v      -- value to convert
    name   -- description of the field, used in the error/warning message
    fatal  -- raise ExtractorError on failure instead of only warning
    kwargs -- forwarded unchanged to int_or_none

    Returns the converted value, or None (after a warning) when the
    conversion fails and fatal is false.
    """
    # A stray debugging print(getattr(v, kwargs['get_attr'])) used to live
    # here; it polluted stdout and raised AttributeError for values lacking
    # the attribute before the failure handling below could run.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    return res
2318
def _float(self, v, name, fatal=False, **kwargs):
    """
    Convert v with float_or_none (kwargs are forwarded to it).

    On failure either raise ExtractorError (fatal) or emit a warning
    through the downloader and return None.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(message)
    self._downloader.report_warning(message)
    return parsed
2328
def _set_cookie(self, domain, name, value, expire_time=None):
    """ Store a cookie for domain (path '/') in the downloader's cookie jar """
    # Spelled out by keyword so each of the constructor's fields is visible
    cookie = compat_cookiejar.Cookie(
        version=0,
        name=name,
        value=value,
        port=None,
        port_specified=None,
        domain=domain,
        domain_specified=None,
        domain_initial_dot=None,
        path='/',
        path_specified=True,
        secure=False,
        expires=expire_time,
        discard='',
        comment=None,
        comment_url=None,
        rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
2334
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header of a throwaway request,
    # then parse that header back into individual cookies.
    request = sanitized_Request(url)
    jar = self._downloader.cookiejar
    jar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
2340
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """
    Yield the test case dicts declared on the extractor.

    A single _TEST or a list _TESTS is supported (not both). Cases marked
    'only_matching' are skipped unless include_onlymatching is set. Each
    yielded dict gets a 'name' key: the class name without its last two
    characters (the 'IE' suffix).
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])

    ie_name = type(self).__name__[:-len('IE')]
    for case in candidates:
        if case.get('only_matching', False) and not include_onlymatching:
            continue
        case['name'] = ie_name
        yield case
2354
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for case in self.get_testcases(include_onlymatching=False):
        playlist = case.get('playlist', [])
        if playlist:
            # Judge a playlist test by its first entry
            case = playlist[0]
        case_limit = case.get('info_dict', {}).get('age_limit')
        if not age_restricted(case_limit, age_limit):
            # One unrestricted test case is enough
            return True
        saw_restricted = True
    # Suitable when there were no test cases at all
    return not saw_restricted
2369
def extract_subtitles(self, *args, **kwargs):
    """
    Return self._get_subtitles(*args, **kwargs) when the 'writesubtitles'
    or 'listsubtitles' downloader option is set, an empty dict otherwise.
    """
    params = self._downloader.params
    wanted = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
2375
2376 def _get_subtitles(self, *args, **kwargs):
611c1dd9 2377 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 2378
912e0b7e
YCH
2379 @staticmethod
2380 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2381 """ Merge subtitle items for one language. Items with duplicated URLs
2382 will be dropped. """
2383 list1_urls = set([item['url'] for item in subtitle_list1])
2384 ret = list(subtitle_list1)
2385 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2386 return ret
2387
2388 @classmethod
8c97f819 2389 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 2390 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
2391 ret = dict(subtitle_dict1)
2392 for lang in subtitle_dict2:
8c97f819 2393 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
2394 return ret
2395
def extract_automatic_captions(self, *args, **kwargs):
    """
    Return self._get_automatic_captions(*args, **kwargs) when the
    'writeautomaticsub' or 'listsubtitles' downloader option is set,
    an empty dict otherwise.
    """
    params = self._downloader.params
    wanted = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
2401
2402 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 2403 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 2404
d77ab8e2
S
def mark_watched(self, *args, **kwargs):
    """
    Call self._mark_watched(*args, **kwargs) when the 'mark_watched'
    option is set and either a login (first element of
    self._get_login_info()) or a 'cookiefile' option is available.
    """
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    has_login = self._get_login_info()[0] is not None
    if has_login or params.get('cookiefile') is not None:
        self._mark_watched(*args, **kwargs)
2410
2411 def _mark_watched(self, *args, **kwargs):
2412 raise NotImplementedError('This method must be implemented by subclasses')
2413
38cce791
YCH
def geo_verification_headers(self):
    """
    Return a headers dict carrying the 'geo_verification_proxy' option
    as 'Ytdl-request-proxy', or an empty dict when the option is unset.
    """
    proxy = self._downloader.params.get('geo_verification_proxy')
    if not proxy:
        return {}
    return {'Ytdl-request-proxy': proxy}
2420
98763ee3
YCH
def _generic_id(self, url):
    """ Derive an id from url: its last path segment, extension removed and percent-decoded """
    last_segment = url.rstrip('/').split('/')[-1]
    stem = os.path.splitext(last_segment)[0]
    return compat_urllib_parse_unquote(stem)
2423
def _generic_title(self, url):
    """ Derive a title from url: url_basename(url), extension removed and percent-decoded """
    basename = url_basename(url)
    stem = os.path.splitext(basename)[0]
    return compat_urllib_parse_unquote(stem)
2426
8dbe9899 2427
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """
        Parse the search pseudo-URL and fetch the requested results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many (capped at _MAX_RESULTS with a warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY