from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    int_or_none,
    parse_iso8601,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
    determine_protocol,
    parse_duration,
    mimetype2ext,
    update_Request,
    update_url_query,
    parse_m3u8_attributes,
    extract_attributes,
    parse_codecs,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                     another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {language: subformats}. "subformats" is a list sorted from
                    lower to higher preference, each element is a dictionary
                    with the "ext" entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series or programme:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

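    For illustration only, a minimal hypothetical "video" result could look
    like this (all values below are made up and only use fields documented
    above):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video/4234987.mp4',
                'ext': 'mp4',
                'format_id': 'mp4-1080',
                'width': 1920,
                'height': 1080,
                'tbr': 4200,
            }],
            'thumbnail': 'https://example.com/thumbs/4234987.jpg',
            'duration': 123.5,
            'upload_date': '20160101',
        }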

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title", "description" and "id" attributes
    with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

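    As a rough, illustrative sketch (the extractor name, URL pattern and site
    are hypothetical, not part of this module), a minimal subclass may look
    like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www[.])?example[.]com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }
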
    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return m.group('id')

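    # Note: suitable() and _match_id() both rely on the subclass' _VALID_URL.
    # As a hedged, hypothetical example (no such extractor is defined here), a
    # pattern with a named "id" group lets _match_id() return that group:
    #
    #     _VALID_URL = r'https?://(?:www[.])?example[.]com/watch/(?P<id>[0-9]+)'
    #     ExampleIE._match_id('http://example.com/watch/4234987')  # -> '4234987'
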
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None, data=None, headers={}, query={}):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

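    # Usage sketch (hypothetical endpoint, for illustration only): extractors
    # typically fetch JSON APIs through _download_json(), which wraps
    # _download_webpage() and _parse_json():
    #
    #     data = self._download_json(
    #         'https://example.com/api/video/%s' % video_id, video_id,
    #         note='Downloading video JSON', fatal=False)
    #     title = data['title'] if data else None
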
    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
            expected=True)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

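    # Usage sketch (illustrative names only): an extractor that finds links to
    # other pages usually wraps them with url_result() and groups them with
    # playlist_result():
    #
    #     entries = [self.url_result(u, ie='Youtube') for u in video_urls]
    #     return self.playlist_result(entries, playlist_id, playlist_title)
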
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

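    # Usage sketch (hypothetical pattern and page, for illustration): a named
    # group plus a default keeps extraction non-fatal when the field is absent:
    #
    #     uploader = self._search_regex(
    #         r'"author"\s*:\s*"(?P<name>[^"]+)"', webpage, 'uploader',
    #         group='name', default=None)
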
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

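    # Usage sketch (for illustration): the _og_search_* helpers above are thin
    # wrappers around _og_regexes()/_search_regex(), so a typical extractor can
    # fill several fields straight from a downloaded page:
    #
    #     title = self._og_search_title(webpage)
    #     description = self._og_search_description(webpage)
    #     thumbnail = self._og_search_thumbnail(webpage)
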
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        if not json_ld:
            return {}
        return self._json_ld(
            json_ld, video_id, fatal=kwargs.get('fatal', True),
            expected_type=expected_type)

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if expected_type is not None and expected_type != item_type:
                return info
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                })
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
                })
            elif item_type == 'VideoObject':
                info.update({
                    'url': json_ld.get('contentUrl'),
                    'title': unescapeHTML(json_ld.get('name')),
                    'description': unescapeHTML(json_ld.get('description')),
                    'thumbnail': json_ld.get('thumbnailUrl'),
                    'duration': parse_duration(json_ld.get('duration')),
                    'timestamp': unified_timestamp(json_ld.get('uploadDate')),
                    'filesize': float_or_none(json_ld.get('contentSize')),
                    'tbr': int_or_none(json_ld.get('bitrate')),
                    'width': int_or_none(json_ld.get('width')),
                    'height': int_or_none(json_ld.get('height')),
                })
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                continue
            name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
            if not name:
                continue
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            if not value:
                continue
            hidden_inputs[name.group('value')] = value.group('value')
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

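    # Usage sketch (hypothetical form id and login URL, for illustration):
    # _form_hidden_inputs() is commonly used to carry a site's hidden <input>
    # fields over into a login POST:
    #
    #     login_form = self._form_hidden_inputs('login-form', login_page)
    #     login_form.update({'username': username, 'password': password})
    #     self._download_webpage(
    #         'https://example.com/login', None, 'Logging in',
    #         data=compat_urllib_parse_urlencode(login_form).encode('utf-8'))
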
    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)

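    # Usage sketch (for illustration): extractors normally collect all format
    # dicts first and call _sort_formats() once at the end, so the worst-to-best
    # ordering documented in the class docstring holds:
    #
    #     formats = []
    #     ...  # append HTTP/HLS/HDS format dicts here
    #     self._sort_formats(formats)
    #     return {'id': video_id, 'title': title, 'formats': formats}
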
    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video'):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'preference': preference,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [3] the #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in a master playlist, thus we can clearly detect a media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        last_info = None
        last_media = None
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
                # Although the specification does not mention the NAME attribute for
                # EXT-X-STREAM-INF, it may still sometimes be present
                stream_name = last_info.get('NAME') or last_media_name
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                codecs = last_info.get('CODECS')
                if codecs:
                    vcodec, acodec = [None] * 2
                    va_codecs = codecs.split(',')
                    if len(va_codecs) == 1:
                        # Audio only entries usually come with a single codec and
                        # no resolution. For more robustness we also check it to
                        # be mp4 audio.
                        if not resolution and va_codecs[0].startswith('mp4a'):
                            vcodec, acodec = 'none', va_codecs[0]
                        else:
                            vcodec = va_codecs[0]
                    else:
                        vcodec, acodec = va_codecs[:2]
                    f.update({
                        'acodec': acodec,
                        'vcodec': vcodec,
                    })
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        return formats

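    # Usage sketch (hypothetical manifest URL, for illustration): a typical call
    # that merges HLS renditions into the format list and tags them with an
    # 'hls' format_id:
    #
    #     formats.extend(self._extract_m3u8_formats(
    #         'https://example.com/stream/%s/master.m3u8' % video_id, video_id,
    #         'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
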
a107193e
S
1243 @staticmethod
1244 def _xpath_ns(path, namespace=None):
1245 if not namespace:
1246 return path
1247 out = []
1248 for c in path.split('/'):
1249 if not c or c == '.':
1250 out.append(c)
1251 else:
1252 out.append('{%s}%s' % (namespace, c))
1253 return '/'.join(out)
1254
09f572fb 1255 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1256 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 1257
995029a1
PH
1258 if smil is False:
1259 assert not fatal
1260 return []
e89a2aab 1261
17712eeb 1262 namespace = self._parse_smil_namespace(smil)
a107193e
S
1263
1264 return self._parse_smil_formats(
1265 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1266
1267 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1268 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1269 if smil is False:
1270 return {}
1271 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1272
09f572fb 1273 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
1274 return self._download_xml(
1275 smil_url, video_id, 'Downloading SMIL file',
09f572fb 1276 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
1277
1278 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 1279 namespace = self._parse_smil_namespace(smil)
a107193e
S
1280
1281 formats = self._parse_smil_formats(
1282 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1283 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1284
1285 video_id = os.path.splitext(url_basename(smil_url))[0]
1286 title = None
1287 description = None
647eab45 1288 upload_date = None
a107193e
S
1289 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1290 name = meta.attrib.get('name')
1291 content = meta.attrib.get('content')
1292 if not name or not content:
1293 continue
1294 if not title and name == 'title':
1295 title = content
1296 elif not description and name in ('description', 'abstract'):
1297 description = content
647eab45
S
1298 elif not upload_date and name == 'date':
1299 upload_date = unified_strdate(content)
a107193e 1300
1e5bcdec
S
1301 thumbnails = [{
1302 'id': image.get('type'),
1303 'url': image.get('src'),
1304 'width': int_or_none(image.get('width')),
1305 'height': int_or_none(image.get('height')),
1306 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1307
a107193e
S
1308 return {
1309 'id': video_id,
1310 'title': title or video_id,
1311 'description': description,
647eab45 1312 'upload_date': upload_date,
1e5bcdec 1313 'thumbnails': thumbnails,
a107193e
S
1314 'formats': formats,
1315 'subtitles': subtitles,
1316 }
1317
17712eeb
S
1318 def _parse_smil_namespace(self, smil):
1319 return self._search_regex(
1320 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1321
f877c6ae 1322 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
1323 base = smil_url
1324 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1325 b = meta.get('base') or meta.get('httpBase')
1326 if b:
1327 base = b
1328 break
e89a2aab
S
1329
1330 formats = []
1331 rtmp_count = 0
a107193e 1332 http_count = 0
7f32e5dc 1333 m3u8_count = 0
a107193e 1334
81e1c4e2 1335 srcs = []
ad96b4c8
YCH
1336 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1337 for medium in media:
1338 src = medium.get('src')
81e1c4e2 1339 if not src or src in srcs:
a107193e 1340 continue
81e1c4e2 1341 srcs.append(src)
a107193e 1342
ad96b4c8
YCH
1343 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1344 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1345 width = int_or_none(medium.get('width'))
1346 height = int_or_none(medium.get('height'))
1347 proto = medium.get('proto')
1348 ext = medium.get('ext')
a107193e 1349 src_ext = determine_ext(src)
ad96b4c8 1350 streamer = medium.get('streamer') or base
a107193e
S
1351
1352 if proto == 'rtmp' or streamer.startswith('rtmp'):
1353 rtmp_count += 1
1354 formats.append({
1355 'url': streamer,
1356 'play_path': src,
1357 'ext': 'flv',
1358 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1359 'tbr': bitrate,
1360 'filesize': filesize,
1361 'width': width,
1362 'height': height,
1363 })
f877c6ae
YCH
1364 if transform_rtmp_url:
1365 streamer, src = transform_rtmp_url(streamer, src)
1366 formats[-1].update({
1367 'url': streamer,
1368 'play_path': src,
1369 })
a107193e
S
1370 continue
1371
1372 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 1373 src_url = src_url.strip()
a107193e
S
1374
1375 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 1376 m3u8_formats = self._extract_m3u8_formats(
1377 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1378 if len(m3u8_formats) == 1:
1379 m3u8_count += 1
1380 m3u8_formats[0].update({
1381 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1382 'tbr': bitrate,
1383 'width': width,
1384 'height': height,
1385 })
1386 formats.extend(m3u8_formats)
a107193e
S
1387 continue
1388
1389 if src_ext == 'f4m':
1390 f4m_url = src_url
1391 if not f4m_params:
1392 f4m_params = {
1393 'hdcore': '3.2.0',
1394 'plugin': 'flowplayer-3.2.0.1',
1395 }
1396 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 1397 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 1398 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
a107193e
S
1399 continue
1400
c78e4817 1401 if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
a107193e
S
1402 http_count += 1
1403 formats.append({
1404 'url': src_url,
1405 'ext': ext or src_ext or 'flv',
1406 'format_id': 'http-%d' % (bitrate or http_count),
1407 'tbr': bitrate,
1408 'filesize': filesize,
1409 'width': width,
1410 'height': height,
1411 })
1412 continue
63757032 1413
e89a2aab
S
1414 return formats
1415
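# Example for _parse_smil_formats above (hypothetical manifest values): a SMIL entry such as
#   <video src="clip-720.mp4" system-bitrate="1500000" width="1280" height="720"/>
# resolved against an http(s) base would be emitted roughly as
#   {'url': '<base>/clip-720.mp4', 'ext': 'mp4', 'format_id': 'http-1500',
#    'tbr': 1500.0, 'width': 1280, 'height': 720}
# while rtmp, m3u8 and f4m sources are routed to the dedicated branches above.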
ce00af87 1416 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 1417 urls = []
a107193e
S
1418 subtitles = {}
1419 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1420 src = textstream.get('src')
d413095f 1421 if not src or src in urls:
a107193e 1422 continue
d413095f 1423 urls.append(src)
df634be2 1424 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 1425 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
1426 subtitles.setdefault(lang, []).append({
1427 'url': src,
1428 'ext': ext,
1429 })
1430 return subtitles
63757032 1431
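# Example for _parse_smil_subtitles above (hypothetical values): a
#   <textstream src="subs-en.srt" systemLanguage="en"/>
# element is collected as {'en': [{'url': 'subs-en.srt', 'ext': 'srt'}]};
# when no language attribute is present the subtitles_lang default ('en') is used.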
942acef5
S
1432 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1433 xspf = self._download_xml(
8d6765cf 1434 playlist_url, playlist_id, 'Downloading xspf playlist',
942acef5
S
1435 'Unable to download xspf manifest', fatal=fatal)
1436 if xspf is False:
1437 return []
1438 return self._parse_xspf(xspf, playlist_id)
8d6765cf 1439
942acef5 1440 def _parse_xspf(self, playlist, playlist_id):
8d6765cf
S
1441 NS_MAP = {
1442 'xspf': 'http://xspf.org/ns/0/',
1443 's1': 'http://static.streamone.nl/player/ns/0',
1444 }
1445
1446 entries = []
1447 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1448 title = xpath_text(
98044462 1449 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
1450 description = xpath_text(
1451 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1452 thumbnail = xpath_text(
1453 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1454 duration = float_or_none(
1455 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1456
1457 formats = [{
1458 'url': location.text,
1459 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1460 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1461 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1462 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1463 self._sort_formats(formats)
1464
1465 entries.append({
1466 'id': playlist_id,
1467 'title': title,
1468 'description': description,
1469 'thumbnail': thumbnail,
1470 'duration': duration,
1471 'formats': formats,
1472 })
1473 return entries
1474
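# Example for _parse_xspf above (hypothetical playlist data): a track in the
# http://xspf.org/ns/0/ namespace such as
#   <track><title>Some clip</title><location>http://example.com/clip.mp4</location></track>
# becomes an entry titled 'Some clip' with a single format whose 'url' is the
# <location> text; a missing <title> falls back to the playlist_id.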
1bac3455 1475 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1476 res = self._download_webpage_handle(
1477 mpd_url, video_id,
1478 note=note or 'Downloading MPD manifest',
1479 errnote=errnote or 'Failed to download MPD manifest',
2d2fa82d 1480 fatal=fatal)
1bac3455 1481 if res is False:
2d2fa82d 1482 return []
1bac3455 1483 mpd, urlh = res
1484 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1485
91cb6b50 1486 return self._parse_mpd_formats(
1bac3455 1487 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
2d2fa82d 1488
91cb6b50 1489 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1bac3455 1490 if mpd_doc.get('type') == 'dynamic':
1491 return []
2d2fa82d 1492
91cb6b50 1493 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 1494
1495 def _add_ns(path):
1496 return self._xpath_ns(path, namespace)
1497
675d0016 1498 def is_drm_protected(element):
1499 return element.find(_add_ns('ContentProtection')) is not None
1500
1bac3455 1501 def extract_multisegment_info(element, ms_parent_info):
1502 ms_info = ms_parent_info.copy()
f14be228 1503 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 1504 if segment_list is not None:
f14be228 1505 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 1506 if segment_urls_e:
1507 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
f14be228 1508 initialization = segment_list.find(_add_ns('Initialization'))
1bac3455 1509 if initialization is not None:
1510 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1511 else:
f14be228 1512 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 1513 if segment_template is not None:
1514 start_number = segment_template.get('startNumber')
1515 if start_number:
1516 ms_info['start_number'] = int(start_number)
f14be228 1517 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1bac3455 1518 if segment_timeline is not None:
f14be228 1519 s_e = segment_timeline.findall(_add_ns('S'))
1bac3455 1520 if s_e:
1521 ms_info['total_number'] = 0
1522 for s in s_e:
1523 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1524 else:
1525 timescale = segment_template.get('timescale')
1526 if timescale:
1527 ms_info['timescale'] = int(timescale)
1528 segment_duration = segment_template.get('duration')
1529 if segment_duration:
1530 ms_info['segment_duration'] = int(segment_duration)
1531 media_template = segment_template.get('media')
1532 if media_template:
1533 ms_info['media_template'] = media_template
1534 initialization = segment_template.get('initialization')
1535 if initialization:
1536 ms_info['initialization_url'] = initialization
1537 else:
f14be228 1538 initialization = segment_template.find(_add_ns('Initialization'))
1bac3455 1539 if initialization is not None:
1540 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1541 return ms_info
b323e170 1542
1bac3455 1543 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 1544 formats = []
f14be228 1545 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 1546 period_duration = parse_duration(period.get('duration')) or mpd_duration
1547 period_ms_info = extract_multisegment_info(period, {
1548 'start_number': 1,
1549 'timescale': 1,
1550 })
f14be228 1551 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
675d0016 1552 if is_drm_protected(adaptation_set):
1553 continue
1bac3455 1554 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 1555 for representation in adaptation_set.findall(_add_ns('Representation')):
675d0016 1556 if is_drm_protected(representation):
1557 continue
1bac3455 1558 representation_attrib = adaptation_set.attrib.copy()
1559 representation_attrib.update(representation.attrib)
a6c8b759
YCH
1560 # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
1561 mime_type = representation_attrib['mimeType']
1562 content_type = mime_type.split('/')[0]
1bac3455 1563 if content_type == 'text':
1564 # TODO implement WebVTT downloading
1565 pass
1566 elif content_type == 'video' or content_type == 'audio':
1567 base_url = ''
1568 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 1569 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 1570 if base_url_e is not None:
1571 base_url = base_url_e.text + base_url
1572 if re.match(r'^https?://', base_url):
1573 break
bb20526b
S
1574 if mpd_base_url and not re.match(r'^https?://', base_url):
1575 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1576 mpd_base_url += '/'
1bac3455 1577 base_url = mpd_base_url + base_url
1578 representation_id = representation_attrib.get('id')
d577c796 1579 lang = representation_attrib.get('lang')
51e9094f 1580 url_el = representation.find(_add_ns('BaseURL'))
1581 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1bac3455 1582 f = {
154c209e 1583 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1bac3455 1584 'url': base_url,
a6c8b759 1585 'ext': mimetype2ext(mime_type),
1bac3455 1586 'width': int_or_none(representation_attrib.get('width')),
1587 'height': int_or_none(representation_attrib.get('height')),
1588 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1589 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1590 'fps': int_or_none(representation_attrib.get('frameRate')),
1591 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1592 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
d577c796 1593 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 1594 'format_note': 'DASH %s' % content_type,
51e9094f 1595 'filesize': filesize,
1bac3455 1596 }
1597 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1598 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1599 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
6a3828fd 1600 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1601 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1bac3455 1602 media_template = representation_ms_info['media_template']
1603 media_template = media_template.replace('$RepresentationID$', representation_id)
db8ee7ec 1604 media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
fb38aa8b 1605 media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1bac3455 1606 media_template = media_template.replace('$$', '$')
b507cc92
S
1607 representation_ms_info['segment_urls'] = [
1608 media_template % {
1609 'Number': segment_number,
1610 'Bandwidth': representation_attrib.get('bandwidth')}
1611 for segment_number in range(
1612 representation_ms_info['start_number'],
1613 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1bac3455 1614 if 'segment_urls' in representation_ms_info:
1615 f.update({
1616 'segment_urls': representation_ms_info['segment_urls'],
1617 'protocol': 'http_dash_segments',
df374b52 1618 })
1bac3455 1619 if 'initialization_url' in representation_ms_info:
1620 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1621 f.update({
1622 'initialization_url': initialization_url,
1623 })
1624 if not f.get('url'):
1625 f['url'] = initialization_url
1626 try:
1627 existing_format = next(
1628 fo for fo in formats
1629 if fo['format_id'] == representation_id)
1630 except StopIteration:
1631 full_info = formats_dict.get(representation_id, {}).copy()
1632 full_info.update(f)
1633 formats.append(full_info)
1634 else:
1635 existing_format.update(f)
17b598d3 1636 else:
1bac3455 1637 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
1638 return formats
1639
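# Worked example for the SegmentTemplate handling in _parse_mpd_formats above
# (hypothetical template and representation id): a media template like
#   'seg-$RepresentationID$-$Number%05d$.m4s' with representation id 'video1'
# is rewritten to 'seg-video1-%(Number)05d.m4s' by the re.sub calls and then
# formatted once per segment number, yielding 'seg-video1-00001.m4s',
# 'seg-video1-00002.m4s', ... starting at the multisegment info's start_number
# (1 unless the manifest says otherwise).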
59bbe491 1640 def _parse_html5_media_entries(self, base_url, webpage):
1641 def absolute_url(video_url):
1642 return compat_urlparse.urljoin(base_url, video_url)
1643
1644 def parse_content_type(content_type):
1645 if not content_type:
1646 return {}
1647 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
1648 if ctr:
1649 mimetype, codecs = ctr.groups()
1650 f = parse_codecs(codecs)
1651 f['ext'] = mimetype2ext(mimetype)
1652 return f
1653 return {}
1654
1655 entries = []
1656 for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
1657 media_info = {
1658 'formats': [],
1659 'subtitles': {},
1660 }
1661 media_attributes = extract_attributes(media_tag)
1662 src = media_attributes.get('src')
1663 if src:
1664 media_info['formats'].append({
1665 'url': absolute_url(src),
1666 'vcodec': 'none' if media_type == 'audio' else None,
1667 })
1668 media_info['thumbnail'] = media_attributes.get('poster')
1669 if media_content:
1670 for source_tag in re.findall(r'<source[^>]+>', media_content):
1671 source_attributes = extract_attributes(source_tag)
1672 src = source_attributes.get('src')
1673 if not src:
1674 continue
1675 f = parse_content_type(source_attributes.get('type'))
1676 f.update({
1677 'url': absolute_url(src),
1678 'vcodec': 'none' if media_type == 'audio' else f.get('vcodec'),
1679 })
1680 media_info['formats'].append(f)
1681 for track_tag in re.findall(r'<track[^>]+>', media_content):
1682 track_attributes = extract_attributes(track_tag)
1683 kind = track_attributes.get('kind')
1684 if not kind or kind == 'subtitles':
1685 src = track_attributes.get('src')
1686 if not src:
1687 continue
1688 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
1689 media_info['subtitles'].setdefault(lang, []).append({
1690 'url': absolute_url(src),
1691 })
1692 if media_info['formats']:
1693 entries.append(media_info)
1694 return entries
1695
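# Example for _parse_html5_media_entries above (hypothetical markup):
#   <video poster="/thumb.jpg">
#     <source src="/clip.webm" type="video/webm">
#     <track kind="subtitles" srclang="de" src="/de.vtt">
#   </video>
# yields one entry whose format URL and subtitle URL ('de') are resolved against
# base_url, with 'ext' taken from the type attribute and 'thumbnail' from poster.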
f4b1c7ad
PH
1696 def _live_title(self, name):
1697 """ Generate the title for a live video """
1698 now = datetime.datetime.now()
611c1dd9 1699 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
1700 return name + ' ' + now_str
1701
b14f3a4c
PH
1702 def _int(self, v, name, fatal=False, **kwargs):
1703 res = int_or_none(v, **kwargs)
1704 if 'get_attr' in kwargs:
1705 print(getattr(v, kwargs['get_attr']))
1706 if res is None:
1707 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1708 if fatal:
1709 raise ExtractorError(msg)
1710 else:
1711 self._downloader.report_warning(msg)
1712 return res
1713
1714 def _float(self, v, name, fatal=False, **kwargs):
1715 res = float_or_none(v, **kwargs)
1716 if res is None:
1717 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1718 if fatal:
1719 raise ExtractorError(msg)
1720 else:
1721 self._downloader.report_warning(msg)
1722 return res
1723
42939b61 1724 def _set_cookie(self, domain, name, value, expire_time=None):
810fb84d
PH
1725 cookie = compat_cookiejar.Cookie(
1726 0, name, value, None, None, domain, None,
42939b61
JMF
1727 None, '/', True, False, expire_time, '', None, None, None)
1728 self._downloader.cookiejar.set_cookie(cookie)
1729
799207e8 1730 def _get_cookies(self, url):
1731 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 1732 req = sanitized_Request(url)
799207e8 1733 self._downloader.cookiejar.add_cookie_header(req)
1734 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1735
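# Example for _set_cookie/_get_cookies above (hypothetical domain):
#   self._set_cookie('example.com', 'session', '1')
#   self._get_cookies('http://example.com/page')['session'].value == '1'
# i.e. cookies previously stored in the downloader's cookiejar for a matching
# domain/path come back keyed by cookie name in the SimpleCookie.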
05900629
PH
1736 def get_testcases(self, include_onlymatching=False):
1737 t = getattr(self, '_TEST', None)
1738 if t:
1739 assert not hasattr(self, '_TESTS'), \
1740 '%s has _TEST and _TESTS' % type(self).__name__
1741 tests = [t]
1742 else:
1743 tests = getattr(self, '_TESTS', [])
1744 for t in tests:
1745 if not include_onlymatching and t.get('only_matching', False):
1746 continue
1747 t['name'] = type(self).__name__[:-len('IE')]
1748 yield t
1749
1750 def is_suitable(self, age_limit):
1751 """ Test whether the extractor is generally suitable for the given
1752 age limit (i.e. pornographic sites are not, all others usually are) """
1753
1754 any_restricted = False
1755 for tc in self.get_testcases(include_onlymatching=False):
1756 if 'playlist' in tc:
1757 tc = tc['playlist'][0]
1758 is_restricted = age_restricted(
1759 tc.get('info_dict', {}).get('age_limit'), age_limit)
1760 if not is_restricted:
1761 return True
1762 any_restricted = any_restricted or is_restricted
1763 return not any_restricted
1764
a504ced0 1765 def extract_subtitles(self, *args, **kwargs):
9868ea49
JMF
1766 if (self._downloader.params.get('writesubtitles', False) or
1767 self._downloader.params.get('listsubtitles')):
1768 return self._get_subtitles(*args, **kwargs)
1769 return {}
a504ced0
JMF
1770
1771 def _get_subtitles(self, *args, **kwargs):
611c1dd9 1772 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 1773
912e0b7e
YCH
1774 @staticmethod
1775 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1776 """ Merge subtitle items for one language. Items with duplicated URLs
1777 will be dropped. """
1778 list1_urls = set([item['url'] for item in subtitle_list1])
1779 ret = list(subtitle_list1)
1780 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1781 return ret
1782
1783 @classmethod
8c97f819 1784 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 1785 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
1786 ret = dict(subtitle_dict1)
1787 for lang in subtitle_dict2:
8c97f819 1788 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
1789 return ret
1790
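# Example for _merge_subtitles above (hypothetical URLs): merging
#   {'en': [{'url': 'a.vtt'}]}
# with
#   {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'fr': [{'url': 'c.vtt'}]}
# gives {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'fr': [{'url': 'c.vtt'}]}:
# per-language lists are concatenated and duplicate URLs are dropped.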
360e1ca5 1791 def extract_automatic_captions(self, *args, **kwargs):
9868ea49
JMF
1792 if (self._downloader.params.get('writeautomaticsub', False) or
1793 self._downloader.params.get('listsubtitles')):
1794 return self._get_automatic_captions(*args, **kwargs)
1795 return {}
360e1ca5
JMF
1796
1797 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 1798 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 1799
d77ab8e2
S
1800 def mark_watched(self, *args, **kwargs):
1801 if (self._downloader.params.get('mark_watched', False) and
1802 (self._get_login_info()[0] is not None or
1803 self._downloader.params.get('cookiefile') is not None)):
1804 self._mark_watched(*args, **kwargs)
1805
1806 def _mark_watched(self, *args, **kwargs):
1807 raise NotImplementedError('This method must be implemented by subclasses')
1808
38cce791
YCH
1809 def geo_verification_headers(self):
1810 headers = {}
1811 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
1812 if geo_verification_proxy:
1813 headers['Ytdl-request-proxy'] = geo_verification_proxy
1814 return headers
1815
8dbe9899 1816
d6983cb4
PH
1817class SearchInfoExtractor(InfoExtractor):
1818 """
1819 Base class for paged search queries extractors.
10952eb2 1820 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
d6983cb4
PH
1821 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1822 """
1823
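# Example (assuming a subclass with _SEARCH_KEY = 'examplesearch'):
#   'examplesearch:foo'    -> 1 result for "foo"
#   'examplesearch5:foo'   -> 5 results
#   'examplesearchall:foo' -> _MAX_RESULTS results
# as dispatched by _real_extract below.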
1824 @classmethod
1825 def _make_valid_url(cls):
1826 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1827
1828 @classmethod
1829 def suitable(cls, url):
1830 return re.match(cls._make_valid_url(), url) is not None
1831
1832 def _real_extract(self, query):
1833 mobj = re.match(self._make_valid_url(), query)
1834 if mobj is None:
f1a9d64e 1835 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
1836
1837 prefix = mobj.group('prefix')
1838 query = mobj.group('query')
1839 if prefix == '':
1840 return self._get_n_results(query, 1)
1841 elif prefix == 'all':
1842 return self._get_n_results(query, self._MAX_RESULTS)
1843 else:
1844 n = int(prefix)
1845 if n <= 0:
f1a9d64e 1846 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 1847 elif n > self._MAX_RESULTS:
f1a9d64e 1848 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
1849 n = self._MAX_RESULTS
1850 return self._get_n_results(query, n)
1851
1852 def _get_n_results(self, query, n):
1853 """Get a specified number of results for a query"""
611c1dd9 1854 raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
1855
1856 @property
1857 def SEARCH_KEY(self):
1858 return self._SEARCH_KEY