]>
Commit | Line | Data |
---|---|---|
6a3828fd | 1 | from __future__ import unicode_literals |
f1a9d64e | 2 | |
d6983cb4 | 3 | import base64 |
f4b1c7ad | 4 | import datetime |
3ec05685 | 5 | import hashlib |
3d3538e4 | 6 | import json |
4094b6e3 | 7 | import netrc |
d6983cb4 PH |
8 | import os |
9 | import re | |
10 | import socket | |
11 | import sys | |
4094b6e3 | 12 | import time |
1bac3455 | 13 | import math |
d6983cb4 | 14 | |
8c25f81b | 15 | from ..compat import ( |
42939b61 | 16 | compat_cookiejar, |
799207e8 | 17 | compat_cookies, |
e9c0cdd3 | 18 | compat_etree_fromstring, |
e64b7569 | 19 | compat_getpass, |
d6983cb4 | 20 | compat_http_client, |
e9c0cdd3 YCH |
21 | compat_os_name, |
22 | compat_str, | |
d6983cb4 | 23 | compat_urllib_error, |
15707c7e | 24 | compat_urllib_parse_urlencode, |
41d06b04 | 25 | compat_urllib_request, |
f0b5d6af | 26 | compat_urlparse, |
8c25f81b | 27 | ) |
b22ca762 | 28 | from ..downloader.f4m import remove_encrypted_media |
8c25f81b | 29 | from ..utils import ( |
c342041f | 30 | NO_DEFAULT, |
05900629 | 31 | age_restricted, |
08f2a92c | 32 | bug_reports_message, |
d6983cb4 PH |
33 | clean_html, |
34 | compiled_regex_type, | |
70f0f5a8 | 35 | determine_ext, |
9b9c5355 | 36 | error_to_compat_str, |
d6983cb4 | 37 | ExtractorError, |
97f4aecf | 38 | fix_xml_ampersands, |
b14f3a4c | 39 | float_or_none, |
31bb8d3f | 40 | int_or_none, |
4ca2a3cf | 41 | parse_iso8601, |
55b3e45b | 42 | RegexNotFoundError, |
d41e6efc | 43 | sanitize_filename, |
5c2266df | 44 | sanitized_Request, |
f38de77f | 45 | unescapeHTML, |
647eab45 | 46 | unified_strdate, |
a107193e | 47 | url_basename, |
8d6765cf S |
48 | xpath_text, |
49 | xpath_with_ns, | |
d497a201 | 50 | determine_protocol, |
1bac3455 | 51 | parse_duration, |
cafcf657 | 52 | mimetype2ext, |
41d06b04 | 53 | update_Request, |
cdfee168 | 54 | update_url_query, |
d6983cb4 | 55 | ) |
c342041f | 56 | |
d6983cb4 PH |
57 | |
58 | class InfoExtractor(object): | |
59 | """Information Extractor class. | |
60 | ||
61 | Information extractors are the classes that, given a URL, extract | |
62 | information about the video (or videos) the URL refers to. This | |
63 | information includes the real video URL, the video title, author and | |
64 | others. The information is stored in a dictionary which is then | |
5d380852 | 65 | passed to the YoutubeDL. The YoutubeDL processes this |
d6983cb4 PH |
66 | information possibly downloading the video to the file system, among |
67 | other possible outcomes. | |
68 | ||
cf0649f8 | 69 | The _type field determines the type of the result.
fed5d032 PH |
70 | By far the most common value (and the default if _type is missing) is |
71 | "video", which indicates a single video. | |
72 | ||
73 | For a video, the dictionaries must include the following fields: | |
d6983cb4 PH |
74 | |
75 | id: Video identifier. | |
d6983cb4 | 76 | title: Video title, unescaped. |
d67b0b15 | 77 | |
f49d89ee | 78 | Additionally, it must contain either a formats entry or a url one: |
d67b0b15 | 79 | |
f49d89ee PH |
80 | formats: A list of dictionaries for each format available, ordered |
81 | from worst to best quality. | |
82 | ||
83 | Potential fields: | |
d67b0b15 | 84 | * url Mandatory. The URL of the video file |
10952eb2 | 85 | * ext Will be calculated from URL if missing |
d67b0b15 PH |
86 | * format A human-readable description of the format |
87 | ("mp4 container with h264/opus"). | |
88 | Calculated from the format_id, width, height, | |
89 | and format_note fields if missing. | |
90 | * format_id A short description of the format | |
5d4f3985 PH |
91 | ("mp4_h264_opus" or "19"). |
92 | Technically optional, but strongly recommended. | |
d67b0b15 PH |
93 | * format_note Additional info about the format |
94 | ("3D" or "DASH video") | |
95 | * width Width of the video, if known | |
96 | * height Height of the video, if known | |
f49d89ee | 97 | * resolution Textual description of width and height |
7217e148 | 98 | * tbr Average bitrate of audio and video in KBit/s |
d67b0b15 PH |
99 | * abr Average audio bitrate in KBit/s |
100 | * acodec Name of the audio codec in use | |
dd27fd17 | 101 | * asr Audio sampling rate in Hertz |
d67b0b15 | 102 | * vbr Average video bitrate in KBit/s |
fbb21cf5 | 103 | * fps Frame rate |
d67b0b15 | 104 | * vcodec Name of the video codec in use |
1394ce65 | 105 | * container Name of the container format |
d67b0b15 | 106 | * filesize The number of bytes, if known in advance |
9732d77e | 107 | * filesize_approx An estimate for the number of bytes |
d67b0b15 | 108 | * player_url SWF Player URL (used for rtmpdump). |
c7deaa4c PH |
109 | * protocol The protocol that will be used for the actual |
110 | download, lower-case. | |
b04b8852 | 111 | "http", "https", "rtsp", "rtmp", "rtmpe", |
af7d5a63 | 112 | "m3u8", "m3u8_native" or "http_dash_segments". |
f49d89ee | 113 | * preference Order number of this format. If this field is |
08d13955 | 114 | present and not None, the formats get sorted |
38d63d84 | 115 | by this field, regardless of all other values. |
f49d89ee PH |
116 | -1 for default (order by other properties), |
117 | -2 or smaller for less than default. | |
e65566a9 PH |
118 | < -1000 to hide the format (if there is |
119 | another one which is strictly better) | |
32f90364 PH |
120 | * language Language code, e.g. "de" or "en-US". |
121 | * language_preference Is this in the language mentioned in | |
122 | the URL? | |
aff2f4f4 PH |
123 | 10 if it's what the URL is about, |
124 | -1 for default (don't know), | |
125 | -10 otherwise, other values reserved for now. | |
5d73273f PH |
126 | * quality Order number of the video quality of this |
127 | format, irrespective of the file format. | |
128 | -1 for default (order by other properties), | |
129 | -2 or smaller for less than default. | |
c64ed2a3 PH |
130 | * source_preference Order number for this video source |
131 | (quality takes higher priority) | |
132 | -1 for default (order by other properties), | |
133 | -2 or smaller for less than default. | |
d769be6c PH |
134 | * http_headers A dictionary of additional HTTP headers |
135 | to add to the request. | |
6271f1ca | 136 | * stretched_ratio If given and not 1, indicates that the |
3dee7826 PH |
137 | video's pixels are not square. |
138 | width : height ratio as float. | |
139 | * no_resume The server does not support resuming the | |
140 | (HTTP or RTMP) download. Boolean. | |
141 | ||
c0ba0f48 | 142 | url: Final video URL. |
d6983cb4 | 143 | ext: Video filename extension. |
d67b0b15 PH |
144 | format: The video format, defaults to ext (used for --get-format) |
145 | player_url: SWF Player URL (used for rtmpdump). | |
2f5865cc | 146 | |
d6983cb4 PH |
147 | The following fields are optional: |
148 | ||
f5e43bc6 | 149 | alt_title: A secondary title of the video. |
0afef30b PH |
150 | display_id An alternative identifier for the video, not necessarily |
151 | unique, but available before title. Typically, id is | |
152 | something like "4234987", title "Dancing naked mole rats", | |
153 | and display_id "dancing-naked-mole-rats" | |
d5519808 | 154 | thumbnails: A list of dictionaries, with the following entries: |
cfb56d1a | 155 | * "id" (optional, string) - Thumbnail format ID |
d5519808 | 156 | * "url" |
cfb56d1a | 157 | * "preference" (optional, int) - quality of the image |
d5519808 PH |
158 | * "width" (optional, int) |
159 | * "height" (optional, int) | |
160 | * "resolution" (optional, string "{width}x{height}", | |
161 | deprecated) | |
d6983cb4 | 162 | thumbnail: Full URL to a video thumbnail image. |
f5e43bc6 | 163 | description: Full video description. |
d6983cb4 | 164 | uploader: Full name of the video uploader. |
2bc0c46f | 165 | license: License name the video is licensed under. |
9bb8e0a3 | 166 | creator: The main artist who created the video. |
8aab976b | 167 | release_date: The date (YYYYMMDD) when the video was released. |
955c4514 | 168 | timestamp: UNIX timestamp of the moment the video became available. |
d6983cb4 | 169 | upload_date: Video upload date (YYYYMMDD). |
955c4514 | 170 | If not explicitly set, calculated from timestamp. |
d6983cb4 | 171 | uploader_id: Nickname or id of the video uploader. |
7bcd2830 | 172 | uploader_url: Full URL to a personal webpage of the video uploader. |
da9ec3b9 | 173 | location: Physical location where the video was filmed. |
a504ced0 JMF |
174 | subtitles: The available subtitles as a dictionary in the format |
175 | {language: subformats}. "subformats" is a list sorted from | |
176 | lower to higher preference, each element is a dictionary | |
177 | with the "ext" entry and one of: | |
178 | * "data": The subtitles file contents | |
10952eb2 | 179 | * "url": A URL pointing to the subtitles file |
4bba3716 | 180 | "ext" will be calculated from URL if missing |
360e1ca5 JMF |
181 | automatic_captions: Like 'subtitles', used by the YoutubeIE for |
182 | automatically generated captions | |
62d231c0 | 183 | duration: Length of the video in seconds, as an integer or float. |
f3d29461 | 184 | view_count: How many users have watched the video on the platform. |
19e3dfc9 PH |
185 | like_count: Number of positive ratings of the video |
186 | dislike_count: Number of negative ratings of the video | |
02835c6b | 187 | repost_count: Number of reposts of the video |
2d30521a | 188 | average_rating: Average rating given by users, the scale used depends on the webpage |
19e3dfc9 | 189 | comment_count: Number of comments on the video |
dd622d7c PH |
190 | comments: A list of comments, each with one or more of the following |
191 | properties (all but one of text or html optional): | |
192 | * "author" - human-readable name of the comment author | |
193 | * "author_id" - user ID of the comment author | |
194 | * "id" - Comment ID | |
195 | * "html" - Comment as HTML | |
196 | * "text" - Plain text of the comment | |
197 | * "timestamp" - UNIX timestamp of comment | |
198 | * "parent" - ID of the comment this one is replying to. | |
199 | Set to "root" to indicate that this is a | |
200 | comment to the original video. | |
8dbe9899 | 201 | age_limit: Age restriction for the video, as an integer (years) |
10952eb2 | 202 | webpage_url: The URL to the video webpage, if given to youtube-dl it |
9103bbc5 JMF |
203 | should allow to get the same result again. (It will be set |
204 | by YoutubeDL if it's missing) | |
ad3bc6ac PH |
205 | categories: A list of categories that the video falls in, for example |
206 | ["Sports", "Berlin"] | |
864f24bd | 207 | tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] |
7267bd53 PH |
208 | is_live: True, False, or None (=unknown). Whether this video is a |
209 | live stream that goes on instead of a fixed-length video. | |
7c80519c | 210 | start_time: Time in seconds where the reproduction should start, as |
10952eb2 | 211 | specified in the URL. |
297a564b | 212 | end_time: Time in seconds where the reproduction should end, as |
10952eb2 | 213 | specified in the URL. |
d6983cb4 | 214 | |
7109903e S |
215 | The following fields should only be used when the video belongs to some logical |
216 | chapter or section: | |
217 | ||
218 | chapter: Name or title of the chapter the video belongs to. | |
27bfd4e5 S |
219 | chapter_number: Number of the chapter the video belongs to, as an integer. |
220 | chapter_id: Id of the chapter the video belongs to, as a unicode string. | |
7109903e S |
221 | |
222 | The following fields should only be used when the video is an episode of some | |
223 | series or programme: | |
224 | ||
225 | series: Title of the series or programme the video episode belongs to. | |
226 | season: Title of the season the video episode belongs to. | |
27bfd4e5 S |
227 | season_number: Number of the season the video episode belongs to, as an integer. |
228 | season_id: Id of the season the video episode belongs to, as a unicode string. | |
7109903e S |
229 | episode: Title of the video episode. Unlike mandatory video title field, |
230 | this field should denote the exact title of the video episode | |
231 | without any kind of decoration. | |
27bfd4e5 S |
232 | episode_number: Number of the video episode within a season, as an integer. |
233 | episode_id: Id of the video episode, as a unicode string. | |
7109903e | 234 | |
7a93ab5f S |
235 | The following fields should only be used when the media is a track or a part of |
236 | a music album: | |
237 | ||
238 | track: Title of the track. | |
239 | track_number: Number of the track within an album or a disc, as an integer. | |
240 | track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), | |
241 | as a unicode string. | |
242 | artist: Artist(s) of the track. | |
243 | genre: Genre(s) of the track. | |
244 | album: Title of the album the track belongs to. | |
245 | album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). | |
246 | album_artist: List of all artists appeared on the album (e.g. | |
247 | "Ash Borer / Fell Voices" or "Various Artists", useful for splits | |
248 | and compilations). | |
249 | disc_number: Number of the disc or other physical medium the track belongs to, | |
250 | as an integer. | |
251 | release_year: Year (YYYY) when the album was released. | |
252 | ||
deefc05b | 253 | Unless mentioned otherwise, the fields should be Unicode strings. |
d6983cb4 | 254 | |
d838b1bd PH |
255 | Unless mentioned otherwise, None is equivalent to absence of information. |
256 | ||
fed5d032 PH |
257 | |
258 | _type "playlist" indicates multiple videos. | |
b82f815f PH |
259 | There must be a key "entries", which is a list, an iterable, or a PagedList |
260 | object, each element of which is a valid dictionary by this specification. | |
fed5d032 | 261 | |
e0b9d78f S |
262 | Additionally, playlists can have "title", "description" and "id" attributes |
263 | with the same semantics as videos (see above). | |
fed5d032 PH |
264 | |
265 | ||
266 | _type "multi_video" indicates that there are multiple videos that | |
267 | form a single show, for example multiple acts of an opera or TV episode. | |
268 | It must have an entries key like a playlist and contain all the keys | |
269 | required for a video at the same time. | |
270 | ||
271 | ||
272 | _type "url" indicates that the video must be extracted from another | |
273 | location, possibly by a different extractor. Its only required key is: | |
274 | "url" - the next URL to extract. | |
f58766ce PH |
275 | The key "ie_key" can be set to the class name (minus the trailing "IE", |
276 | e.g. "Youtube") if the extractor class is known in advance. | |
277 | Additionally, the dictionary may have any properties of the resolved entity | |
278 | known in advance, for example "title" if the title of the referred video is | |
fed5d032 PH |
279 | known ahead of time. |
280 | ||
281 | ||
282 | _type "url_transparent" entities have the same specification as "url", but | |
283 | indicate that the given additional information is more precise than the one | |
284 | associated with the resolved URL. | |
285 | This is useful when a site employs a video service that hosts the video and | |
286 | its technical metadata, but that video service does not embed a useful | |
287 | title, description etc. | |
288 | ||
289 | ||
d6983cb4 PH |
290 | Subclasses of this one should re-define the _real_initialize() and |
291 | _real_extract() methods and define a _VALID_URL regexp. | |
292 | Probably, they should also be added to the list of extractors. | |
293 | ||
d6983cb4 PH |
294 | Finally, the _WORKING attribute should be set to False for broken IEs |
295 | in order to warn the users and skip the tests. | |
296 | """ | |
297 | ||
298 | _ready = False | |
299 | _downloader = None | |
300 | _WORKING = True | |
301 | ||
def __init__(self, downloader=None):
    """Create the extractor and attach the (optional) downloader.

    The instance is marked as not yet initialized, then the downloader
    (which may be None) is registered through set_downloader().
    """
    # Reset the per-instance readiness flag before anything else happens.
    self._ready = False
    self.set_downloader(downloader)
306 | ||
@classmethod
def suitable(cls, url):
    """Return True if url matches this class's _VALID_URL pattern."""
    # Look the cached pattern up in this class's own __dict__ on purpose:
    # hasattr/getattr would also find a pattern cached on a superclass,
    # which was compiled from the superclass's _VALID_URL.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
d6983cb4 | 317 | |
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of url matched against this class's _VALID_URL.

    The compiled pattern is cached on the class itself (not inherited from
    a superclass), the same way suitable() does it.

    Raises AssertionError if url does not match _VALID_URL.
    """
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    if m is None:
        # Raised explicitly instead of using an `assert` statement so the
        # check is not stripped under `python -O`; the exception type is
        # kept as AssertionError for backward compatibility.
        raise AssertionError(
            '%s: URL %r does not match _VALID_URL' % (cls.__name__, url))
    return m.group('id')
325 | ||
@classmethod
def working(cls):
    """Return the value of the class attribute _WORKING."""
    flag = cls._WORKING
    return flag
330 | ||
def initialize(self):
    """Run _real_initialize() once; later calls are no-ops."""
    if self._ready:
        return
    self._real_initialize()
    # Only flagged ready after _real_initialize() returned without raising.
    self._ready = True
336 | ||
def extract(self, url):
    """Initialize the extractor if necessary and extract information for url.

    Returns whatever _real_extract(url) returns.

    An ExtractorError propagates unchanged. An IncompleteRead is re-raised
    as an expected ExtractorError ("network error"); a KeyError or
    StopIteration is re-raised as an ExtractorError ("extractor error").
    In both cases the original exception is passed along as the cause.
    """
    try:
        self.initialize()
        result = self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as err:
        raise ExtractorError('A network error has occurred.', cause=err, expected=True)
    except (KeyError, StopIteration) as err:
        raise ExtractorError('An extractor error has occurred.', cause=err)
    return result
d6983cb4 PH |
348 | |
def set_downloader(self, downloader):
    """Store downloader on the instance as self._downloader."""
    setattr(self, '_downloader', downloader)
352 | ||
def _real_initialize(self):
    """Hook for the actual initialization; does nothing here.

    Meant to be redefined in subclasses.
    """
    return None
356 | ||
def _real_extract(self, url):
    """Hook for the actual extraction; does nothing here and returns None.

    Meant to be redefined in subclasses.
    """
    return None
360 | ||
@classmethod
def ie_key(cls):
    """Return the key identifying this extractor class.

    The key is the class name with its last two characters removed
    (the trailing "IE" of extractor class names).
    """
    class_name = cls.__name__
    return compat_str(class_name[:-2])
56c73665 | 365 | |
@property
def IE_NAME(self):
    """Name of this extractor: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
d6983cb4 | 369 | |
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Open url_or_request through the downloader and return the response handle.

    note:    message printed before the request. None prints the default
             "Downloading webpage" message, False prints nothing.
    errnote: prefix of the error message. False makes a network error
             return False silently, None selects the default
             'Unable to download webpage'.
    fatal:   on a network error, raise ExtractorError if True; otherwise
             report a warning and return False.
    data, headers, query: passed to update_Request() when url_or_request
             is already a Request object; otherwise query is merged into
             the URL and data/headers are wrapped into a new request.

    NOTE: the headers/query default dicts are shared between calls and
    must not be mutated.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
    else:
        if query:
            url_or_request = update_url_query(url_or_request, query)
        # Compare data against None rather than testing truthiness: an
        # empty body (b'') is still a body and must not be dropped.
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            # Hand the current traceback over together with the cause.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
d6983cb4 | 402 | |
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """Download a page and return the tuple (decoded content, response handle).

    Returns False instead when _request_webpage() reports a failure by
    returning False (only expected when fatal is false).
    """
    if isinstance(url_or_request, (compat_str, str)):
        # Drop the fragment ('#' and everything after it) from string URLs (#1038)
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(
        url_or_request, video_id, note, errnote, fatal,
        data=data, headers=headers, query=query)
    if handle is False:
        assert not fatal
        return False
    page = self._webpage_read_content(
        handle, url_or_request, video_id, note, errnote, fatal,
        encoding=encoding)
    return (page, handle)
415 | ||
@staticmethod
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Guess the name of the text encoding of a downloaded page.

    content_type:  value of the Content-Type header ('' if absent).
    webpage_bytes: raw body of the response.

    Sources are tried in this order:
      1. the charset parameter of content_type (surrounding quotes and
         any trailing parameters are not part of the result),
      2. a <meta ... charset=...> declaration within the first 1024 bytes,
      3. a UTF-16 byte order mark (little or big endian) -> 'utf-16',
      4. the fallback 'utf-8'.

    The returned name is not checked against the known codecs.
    """
    m = re.match(
        r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=\s*["\']?([^\s;"\']+)',
        content_type)
    if m:
        return m.group(1)

    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    if m:
        try:
            return m.group(1).decode('ascii')
        except UnicodeDecodeError:
            # A charset name containing non-ASCII bytes is garbage; fall
            # through to the remaining heuristics instead of crashing.
            pass
    if webpage_bytes.startswith((b'\xff\xfe', b'\xfe\xff')):
        return 'utf-16'
    return 'utf-8'
432 | ||
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the body of urlh and return it decoded to text.

    prefix:   bytes prepended to the body before any further processing.
    encoding: codec name to decode with; when empty it is guessed by
              _guess_encoding_from_content(). An unknown codec name falls
              back to 'utf-8'. Undecodable bytes are replaced.

    Depending on the downloader params the raw bytes are additionally
    printed base64-encoded ('dump_intermediate_pages') and/or written to
    a '.dump' file ('write_pages').

    Raises an expected ExtractorError when the page is recognized as a
    Websense block page or as the "URL you requested has been blocked"
    page.
    """
    def request_url():
        # Request-like objects provide get_full_url(); anything without
        # that method is used as the URL itself.
        try:
            return url_or_request.get_full_url()
        except AttributeError:
            return url_or_request

    content_type = urlh.headers.get('Content-Type', '')
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, raw)

    if self._downloader.params.get('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + request_url())
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self._downloader.params.get('write_pages', False):
        stem = '%s_%s' % (video_id, request_url())
        if len(stem) > 240:
            # Keep the name at 240 characters by replacing its tail with
            # a digest of the full name.
            digest = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
            stem = stem[:240 - len(digest)] + digest
        filename = sanitize_filename(stem + '.dump', restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absolute = os.path.abspath(filename)
            if len(absolute) > 259:
                filename = '\\\\?\\' + absolute
        with open(filename, 'wb') as dump_file:
            dump_file.write(raw)

    try:
        content = raw.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name
        content = raw.decode('utf-8', 'replace')

    websense_blocked = (
        '<title>Access to this site is blocked</title>' in content and
        'Websense' in content[:512])
    if websense_blocked:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            msg += ' Visit %s for more details' % info_url
        raise ExtractorError(msg, expected=True)

    if '<title>The URL you requested has been blocked</title>' in content[:512]:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        details = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if details:
            msg += ' (Message: "%s")' % details.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)

    return content
d6983cb4 | 495 | |
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
    """Download a page and return its content as a string.

    The work is delegated to _download_webpage_handle(). When that raises
    IncompleteRead the download is attempted again, up to tries attempts
    in total, calling self._sleep(timeout, video_id) between attempts;
    the last IncompleteRead is re-raised.

    Returns False if _download_webpage_handle() returned False.
    """
    failures = 0
    while True:
        try:
            res = self._download_webpage_handle(
                url_or_request, video_id, note, errnote, fatal,
                encoding=encoding, data=data, headers=headers, query=query)
        except compat_http_client.IncompleteRead as e:
            failures += 1
            if failures >= tries:
                raise e
            self._sleep(timeout, video_id)
        else:
            break
    if res is False:
        return res
    # res is the (content, handle) pair; only the content is wanted here
    content, _ = res
    return content
d6983cb4 | 514 | |
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """Download a document and return it parsed by compat_etree_fromstring().

    transform_source, if given, is applied to the downloaded string before
    parsing. Returns False if the download itself returned False.
    """
    document = self._download_webpage(
        url_or_request, video_id, note, errnote,
        fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
    if document is False:
        return document
    if transform_source:
        document = transform_source(document)
    # The parser is fed UTF-8 encoded bytes rather than the text itself
    encoded = document.encode('utf-8')
    return compat_etree_fromstring(encoded)
267ed0c5 | 526 | |
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True, encoding=None, data=None, headers={}, query={}):
    """Download a document and return it parsed by _parse_json().

    Returns None when fatal is false and the download returned False.
    transform_source and fatal are forwarded to _parse_json().
    """
    page = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query)
    if page is False and not fatal:
        return None
    return self._parse_json(
        page, video_id, transform_source=transform_source, fatal=fatal)
539 | ||
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
    """Parse json_string with json.loads() and return the result.

    transform_source, if given, is applied to json_string first.
    On a parse error (ValueError) an ExtractorError is raised when fatal
    is true; otherwise a warning is reported and None is returned.
    """
    source = transform_source(json_string) if transform_source else json_string
    try:
        return json.loads(source)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        if not fatal:
            self.report_warning(errmsg + str(ve))
            return None
        raise ExtractorError(errmsg, cause=ve)
3d3538e4 | 551 | |
def report_warning(self, msg, video_id=None):
    """Report msg as a warning through the downloader.

    The message is prefixed with '[IE_NAME] ' and, when video_id is given,
    with 'video_id: '.
    """
    if video_id is None:
        id_prefix = ''
    else:
        id_prefix = '%s: ' % video_id
    warn = self._downloader.report_warning
    warn('[%s] %s%s' % (self.IE_NAME, id_prefix, msg))
f45f96f8 | 556 | |
def to_screen(self, msg):
    """Print msg through the downloader, prefixed with '[IE_NAME] '."""
    emit = self._downloader.to_screen
    emit('[%s] %s' % (self.IE_NAME, msg))
d6983cb4 PH |
560 | |
def report_extraction(self, id_or_name):
    """Print the 'Extracting information' progress message for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4 PH |
564 | |
def report_download_webpage(self, video_id):
    """Print the 'Downloading webpage' progress message for video_id."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4 PH |
568 | |
def report_age_confirmation(self):
    """Print the message announcing an age confirmation attempt."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 | 572 | |
def report_login(self):
    """Print the message announcing a login attempt."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d | 576 | |
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError asking for account credentials.

    msg is used as the first sentence of the error message; a hint about
    the --username/--password and --netrc options is appended.
    """
    full_message = '%s. Use --username and --password or --netrc to provide account credentials.' % msg
    raise ExtractorError(full_message, expected=True)
582 | ||
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError about a geo restriction.

    msg is used as the first sentence of the error message; a hint about
    the --proxy option is appended.
    """
    full_message = '%s. You might want to use --proxy to workaround.' % msg
    raise ExtractorError(full_message, expected=True)
588 | ||
5f6a1245 | 589 | # Methods for following #608 |
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Build a result of _type 'url' that refers to url for further processing.

    ie is stored under 'ie_key' (even when None); 'id' and 'title' are
    only included when video_id / video_title are not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    optional = (('id', video_id), ('title', video_title))
    result.update((key, value) for key, value in optional if value is not None)
    return result
5f6a1245 | 602 | |
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a result of _type 'playlist' holding entries.

    'id', 'title' and 'description' are only included when the
    corresponding argument is truthy.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    result.update((key, value) for key, value in optional if value)
    return result
615 | ||
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a single pattern (string or compiled regex) or an iterable
             of patterns that are tried in order.
    group:   group to return from the match; None returns the first
             group that is not None.
    An empty iterable of patterns is treated as "no match".
    """
    # Start from "no match" so that an empty iterable of patterns takes the
    # regular failure path instead of leaving mobj unbound.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # The field name is colored in messages unless colors are disabled,
    # the OS is Windows ('nt') or stderr is not a terminal.
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
649 | ||
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Same contract as _search_regex, except that a truthy result is passed
    through clean_html and stripped. Falsy results are returned untouched.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
659 | ||
fc79158d JMF |
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)

    Explicitly provided credentials take precedence over .netrc. A username
    given without a password yields (username, None).
    Problems reading .netrc are reported as a warning, not raised.
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username') is not None:
        username = downloader_params['username']
        # The password may legitimately be absent from the params
        password = downloader_params.get('password')
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            # authenticators() yields (login, account, password)
            username = info[0]
            password = info[2]
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

    return (username, password)
689 | ||
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None

    Without a 'twofactor' downloader param the code is requested
    interactively, using note in the prompt.
    """
    downloader = self._downloader
    if downloader is None:
        return None

    twofactor = downloader.params.get('twofactor')
    if twofactor is None:
        return compat_getpass('Type %s and press [Return]: ' % note)
    return downloader.params['twofactor']
83317f69 | 705 | |
46720279 JMF |
706 | # Helper functions for extracting OpenGraph info |
@staticmethod
def _og_regexes(prop):
    """
    Return two regexes matching a <meta> tag for the OpenGraph property
    prop: one with the property attribute before content, one with it after.
    The content value is captured in whichever of the three groups matches
    (double-quoted, single-quoted or unquoted).
    """
    property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                   % {'prop': re.escape(prop)})
    content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
    attribute_orders = (
        (property_re, content_re),
        (content_re, property_re),
    )
    return [r'<meta[^>]+?%s[^>]+?%s' % order for order in attribute_orders]
46720279 | 717 | |
864f24bd S |
@staticmethod
def _meta_regex(prop):
    """
    Return a verbose, case-insensitive regex matching a <meta> tag whose
    itemprop/name/property/id/http-equiv attribute equals prop; the quoted
    content attribute value is captured in the named group 'content'.
    """
    template = r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2'''
    return template % re.escape(prop)
723 | ||
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Look up the OpenGraph property prop in html and return its value with
    HTML entities unescaped, or None if the search returned None.
    name is the field name used for reporting (defaults to 'OpenGraph <prop>');
    extra keyword arguments are forwarded to _search_regex.
    """
    field_name = 'OpenGraph %s' % prop if name is None else name
    raw_value = self._search_regex(
        self._og_regexes(prop), html, field_name, flags=re.DOTALL, **kargs)
    return None if raw_value is None else unescapeHTML(raw_value)
46720279 JMF |
731 | |
def _og_search_thumbnail(self, html, **kargs):
    """Non-fatal lookup of the og:image property, reported as 'thumbnail URL'."""
    return self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)
46720279 JMF |
734 | |
def _og_search_description(self, html, **kargs):
    """Non-fatal lookup of the og:description property."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
737 | ||
def _og_search_title(self, html, **kargs):
    """Lookup of the og:title property; keyword arguments are forwarded as is."""
    return self._og_search_property(
        'title', html, **kargs)
740 | ||
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """
    Search html for an OpenGraph video URL.
    With secure set, og:video:secure_url is tried first, followed by
    og:video and og:video:url.
    """
    regexes = []
    if secure:
        regexes += self._og_regexes('video:secure_url')
    regexes += self._og_regexes('video')
    regexes += self._og_regexes('video:url')
    return self._html_search_regex(regexes, html, name, **kargs)
46720279 | 746 | |
78338f71 JMF |
def _og_search_url(self, html, **kargs):
    """Lookup of the og:url property; keyword arguments are forwarded as is."""
    return self._og_search_property(
        'url', html, **kargs)
749 | ||
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Return the content of the <meta> tag identified by name (see
    _meta_regex), cleaned by _html_search_regex. Non-fatal by default.
    display_name is the field name used for reporting; it defaults to name.
    """
    label = name if display_name is None else display_name
    meta_re = self._meta_regex(name)
    return self._html_search_regex(
        meta_re, html, label, fatal=fatal, group='content', **kwargs)
59040888 PH |
756 | |
def _dc_search_uploader(self, html):
    """Return the content of the dc.creator meta tag, reported as 'uploader'."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
759 | ||
8dbe9899 PH |
def _rta_search(self, html):
    """Return 18 if html carries the RTA rating meta label, 0 otherwise."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    rta_label_re = (
        r'(?ix)<meta\s+name="rating"\s+'
        r' content="RTA-5042-1996-1400-1577-RTA"')
    return 18 if re.search(rta_label_re, html) else 0
767 | ||
59040888 PH |
def _media_rating_search(self, html):
    """
    Map the content of the 'rating' meta tag to an age limit.
    Returns None when the tag is missing/empty or its value is not known.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    if rating:
        age_limits = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return age_limits.get(rating.lower())
    return None
59040888 | 783 | |
def _family_friendly_search(self, html):
    """
    Map the content of the 'isFamilyFriendly' meta tag to an age limit:
    0 for '1'/'true', 18 for '0'/'false' (case-insensitive).
    Returns None when the tag is missing/empty or has any other value.
    """
    # See http://schema.org/VideoObject
    family_friendly = self._html_search_meta('isFamilyFriendly', html)
    if family_friendly:
        age_limits = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return age_limits.get(family_friendly.lower())
    return None
69319969 | 798 | |
0c708f11 JMF |
def _twitter_search_player(self, html):
    """Return the content of the twitter:player meta tag."""
    player = self._html_search_meta(
        'twitter:player', html, 'twitter card player')
    return player
0c708f11 | 802 | |
def _search_json_ld(self, html, video_id, **kwargs):
    """
    Find the first application/ld+json <script> in html and hand its body
    to _json_ld. Returns {} when nothing (truthy) was found.
    Keyword arguments go to _search_regex; 'fatal' is also passed on to
    _json_ld (True when not given).
    """
    json_ld = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    if json_ld:
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    return {}
4ca2a3cf S |
810 | |
def _json_ld(self, json_ld, video_id, fatal=True):
    """
    Extract info dict fields from JSON-LD data.

    json_ld is either already parsed data or a string that is parsed with
    _parse_json (honouring fatal). Both a single node and an array of nodes
    are accepted; the first http://schema.org node of a supported type
    (TVEpisode or Article) is used. Anything else yields {}.
    Fields whose value is None are left out of the result.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}

    # A JSON-LD document may be a single node or an array of nodes;
    # normalize to a list so that arrays do not crash on .get()
    if isinstance(json_ld, dict):
        nodes = [json_ld]
    elif isinstance(json_ld, (list, tuple)):
        nodes = json_ld
    else:
        return {}

    info = {}
    for node in nodes:
        if not isinstance(node, dict) or node.get('@context') != 'http://schema.org':
            continue
        item_type = node.get('@type')
        if item_type == 'TVEpisode':
            info.update({
                'episode': unescapeHTML(node.get('name')),
                'episode_number': int_or_none(node.get('episodeNumber')),
                'description': unescapeHTML(node.get('description')),
            })
            part_of_season = node.get('partOfSeason')
            if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
            part_of_series = node.get('partOfSeries')
            if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                info['series'] = unescapeHTML(part_of_series.get('name'))
        elif item_type == 'Article':
            info.update({
                'timestamp': parse_iso8601(node.get('datePublished')),
                'title': unescapeHTML(node.get('headline')),
                'description': unescapeHTML(node.get('articleBody')),
            })
        else:
            # Unsupported type, keep looking at the remaining nodes
            continue
        break
    return dict((k, v) for k, v in info.items() if v is not None)
838 | ||
@staticmethod
def _hidden_inputs(html):
    """
    Collect {name-or-id: value} for every quoted hidden/submit <input> in
    html. HTML comments are removed first; inputs lacking a quoted
    name/id or a quoted value attribute are ignored.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    collected = {}
    for attrs in re.findall(r'(?i)<input([^>]+)>', html):
        if re.search(r'type=(["\'])(?:hidden|submit)\1', attrs) is None:
            continue
        name_m = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', attrs)
        value_m = re.search(r'value=(["\'])(?P<value>.*?)\1', attrs)
        if name_m and value_m:
            collected[name_m.group('value')] = value_m.group('value')
    return collected
27713812 | 854 | |
cf61d96d S |
def _form_hidden_inputs(self, form_id, html):
    """
    Return the hidden inputs (see _hidden_inputs) of the <form> whose id
    attribute is form_id. form_id is interpolated into the regex verbatim.
    """
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
860 | ||
def _sort_formats(self, formats, field_preference=None):
    """
    Sort formats in place from worst to best.

    Raises ExtractorError when formats is empty. A missing 'tbr' is filled
    in as abr + vbr when both are known, and a missing 'ext' is derived
    from the format URL.
    When field_preference is a list or tuple the formats are ordered by
    those fields only (None counts as -1); otherwise the built-in ordering
    key below is used.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for f in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

    def _value_or(f, field, fallback=-1):
        # Unknown (None/missing) values sort below any known value
        value = f.get(field)
        return fallback if value is None else value

    def _ext_rank(order, ext):
        # Position of ext in the preference list, -1 for unlisted/unknown
        # extensions (including a missing ext, which must not raise)
        try:
            return order.index(ext)
        except ValueError:
            return -1

    def _formats_key(f):
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            return tuple(_value_or(f, field) for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            preference = 0
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

        prefer_free_formats = self._downloader.params.get('prefer_free_formats')
        if f.get('vcodec') == 'none':  # audio only
            preference -= 50
            if prefer_free_formats:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _ext_rank(ORDER, f.get('ext'))
        else:
            if f.get('acodec') == 'none':  # video only
                preference -= 40
            if prefer_free_formats:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _ext_rank(ORDER, f.get('ext'))
            audio_ext_preference = 0

        return (
            preference,
            _value_or(f, 'language_preference'),
            _value_or(f, 'quality'),
            _value_or(f, 'tbr'),
            _value_or(f, 'filesize'),
            _value_or(f, 'vbr'),
            _value_or(f, 'height'),
            _value_or(f, 'width'),
            proto_preference,
            ext_preference,
            _value_or(f, 'abr'),
            audio_ext_preference,
            _value_or(f, 'fps'),
            _value_or(f, 'filesize_approx'),
            _value_or(f, 'source_preference'),
            _value_or(f, 'format_id', ''),
        )
    formats.sort(key=_formats_key)
59040888 | 931 | |
96a53167 S |
def _check_formats(self, formats, video_id):
    """
    Drop, in place, every format whose URL does not pass _is_valid_url.
    Does nothing when formats is empty or None.
    """
    if not formats:
        return
    valid_formats = []
    for f in formats:
        format_id = f.get('format_id')
        item = '%s video format' % format_id if format_id else 'video'
        if self._is_valid_url(f['url'], video_id, item=item):
            valid_formats.append(f)
    formats[:] = valid_formats
939 | ||
f5bdb444 S |
@staticmethod
def _remove_duplicate_formats(formats):
    """
    Remove, in place, formats whose 'url' was already seen earlier in the
    list; the first occurrence of each URL and the original order are kept.
    """
    seen_urls = set()
    kept = []
    for fmt in formats:
        url = fmt['url']
        if url in seen_urls:
            continue
        seen_urls.add(url)
        kept.append(fmt)
    formats[:] = kept
949 | ||
def _is_valid_url(self, url, video_id, item='video'):
    """
    Check a URL by requesting it.
    Non HTTP(S) URLs are assumed valid without a request. Returns False
    (after printing a message) when the request fails with a URLError
    cause; any other ExtractorError is re-raised.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item)
    except ExtractorError as e:
        if not isinstance(e.cause, compat_urllib_error.URLError):
            raise
        self.to_screen(
            '%s: %s URL is invalid, skipping' % (video_id, item))
        return False
    return True
964 | ||
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
971 | ||
57c7411f PH |
def _proto_relative_url(self, url, scheme=None):
    """
    Prefix a protocol-relative URL ('//host/...') with scheme, which
    defaults to http_scheme(). None and any other URL are returned as is.
    """
    if url is None or not url.startswith('//'):
        return url
    prefix = self.http_scheme() if scheme is None else scheme
    return prefix + url
981 | ||
4094b6e3 PH |
def _sleep(self, timeout, video_id, msg_template=None):
    """
    Print a message and sleep for timeout seconds.
    msg_template is %-formatted with the keys 'video_id' and 'timeout'.
    """
    template = (
        '%(video_id)s: Waiting for %(timeout)s seconds'
        if msg_template is None else msg_template)
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
988 | ||
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """
    Download the f4m manifest at manifest_url and return the formats
    parsed from it by _parse_f4m_formats.
    Returns [] when the (non-fatal) download yields False.
    """
    # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
    # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        transform_source=transform_source,
        fatal=fatal)

    if manifest is not False:
        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal)
    return []
1006 | ||
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True):
    """
    Build the list of formats described by an already parsed f4m manifest.

    Both f4m 1.0 and 2.0 <media> nodes are understood. For 2.0 manifests
    each media URL is resolved against the manifest's baseURL (or, lacking
    one, the directory of manifest_url); media entries that are themselves
    f4m manifests are extracted recursively via _extract_f4m_formats with
    the same preference, f4m_id, transform_source and fatal.
    Returns [] when no usable media nodes are left.
    """
    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Fallback base for relative media URLs. Computed once from the original
    # manifest URL so that every media node is resolved against the manifest
    # itself and not against the URL of a previously processed media node.
    manifest_dir = '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith('http://') or media_url.startswith('https://'):
                format_url = media_url
            else:
                format_url = (base_url or manifest_dir) + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # Fall back to the node index when the bitrate is unknown
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    return formats
1054 | ||
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """
    Download the m3u8 playlist at m3u8_url and return a list of formats.

    Returns [] when the (non-fatal) download yields False. A media playlist
    is returned as a single format; for a master playlist a 'meta' format
    pointing at the playlist itself comes first, followed by one format per
    variant URI.
    """
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Relative URIs are resolved against the final (post-redirect) URL
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    m3u8_url = urlh.geturl()

    # We should try extracting formats only from master playlists [1], i.e.
    # playlists that describe available qualities. On the other hand media
    # playlists [2] should be returned as is since they contain just the media
    # without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 2] master
    # playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
    # and MUST NOT appear in master playlist thus we can clearly detect media
    # playlist with this criterion.
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
    # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # KEY=value / KEY="value" pairs of a tag line, quotes removed
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attributes[m.group('key')] = v
        return attributes

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
            continue
        if line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
            continue
        if line.startswith('#') or not line.strip():
            continue

        # Anything else is a URI line
        if last_info is None:
            formats.append({'url': format_url(line)})
            continue
        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        format_id = []
        if m3u8_id:
            format_id.append(m3u8_id)
        last_media_name = None
        if last_media and last_media.get('TYPE') != 'SUBTITLES':
            last_media_name = last_media.get('NAME')
        format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
        f = {
            'format_id': '-'.join(format_id),
            'url': format_url(line.strip()),
            'tbr': tbr,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }
        resolution = last_info.get('RESOLUTION')
        if resolution:
            width_str, height_str = resolution.split('x')
            f['width'] = int(width_str)
            f['height'] = int(height_str)
        codecs = last_info.get('CODECS')
        if codecs:
            vcodec, acodec = [None] * 2
            va_codecs = codecs.split(',')
            if len(va_codecs) != 1:
                vcodec, acodec = va_codecs[:2]
            elif not resolution and va_codecs[0].startswith('mp4a'):
                # Audio only entries usually come with single codec and
                # no resolution. For more robustness we also check it to
                # be mp4 audio.
                vcodec, acodec = 'none', va_codecs[0]
            else:
                vcodec = va_codecs[0]
            f.update({
                'acodec': acodec,
                'vcodec': vcodec,
            })
        if last_media is not None:
            f['m3u8_media'] = last_media
            last_media = None
        formats.append(f)
        last_info = {}
    return formats
1174 | ||
a107193e S |
@staticmethod
def _xpath_ns(path, namespace=None):
    """
    Qualify every step of a '/'-separated path with '{namespace}'.
    Empty steps and '.' are kept as is; without a namespace the path is
    returned unchanged.
    """
    if not namespace:
        return path
    return '/'.join(
        step if not step or step == '.' else '{%s}%s' % (namespace, step)
        for step in path.split('/'))
1186 | ||
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download the SMIL document at smil_url and return the formats parsed
    from it. Returns [] when the (non-fatal) download yields False.
    """
    smil = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if smil is False:
        assert not fatal
        return []

    return self._parse_smil_formats(
        smil, smil_url, video_id,
        namespace=self._parse_smil_namespace(smil), f4m_params=f4m_params)
1198 | ||
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download the SMIL document at smil_url and return the info dict built
    by _parse_smil. Returns {} when the (non-fatal) download yields False.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is not False:
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
    return {}
1204 | ||
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Download smil_url as XML via _download_xml and return its result."""
    smil = self._download_xml(
        smil_url, video_id,
        'Downloading SMIL file', 'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
    return smil
a107193e S |
1209 | |
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats and
    _parse_smil_subtitles; title, description and upload date are taken
    from the first matching ./head/meta elements. The returned 'id' is
    the basename of smil_url without extension (not the video_id argument),
    and also serves as the title when the document has none.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first usable value of each kind is kept
        if not title and name == 'title':
            title = content
        elif not description and name in ('description', 'abstract'):
            description = content
        elif not upload_date and name == 'date':
            upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1249 | ||
17712eeb S |
def _parse_smil_namespace(self, smil):
    """
    Return the XML namespace found in the '{namespace}smil' tag of the
    given root element, or None when the tag has no such form.
    """
    namespace = self._search_regex(
        r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    return namespace
1253 | ||
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Build the list of formats described by the <video> elements of a
    parsed SMIL document.

    Relative sources are resolved against the 'base'/'httpBase' attribute
    of the first ./head/meta element providing one, or smil_url otherwise.
    Each distinct src is handled once, as RTMP, HLS (m3u8), HDS (f4m) or
    plain HTTP. Plain HTTP formats are only added when their resolved URL
    passes _is_valid_url.
    f4m_params are appended to f4m manifest URLs as query parameters (a
    default hdcore/plugin pair is used when not given). transform_rtmp_url,
    if set, is called as transform_rtmp_url(streamer, src) and must return
    the (url, play_path) pair to use for an RTMP format.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = set()
    for video in smil.findall(self._xpath_ns('.//video', namespace)):
        src = video.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A lone format: tag it with the metadata of this <video>
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse_urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved URL: checking the raw src would let every
        # relative source through, since non HTTP(S) URLs count as valid
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

    return formats
1347 | ||
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """
    Collect the <textstream> elements of a parsed SMIL document as a
    {language: [{'url': ..., 'ext': ...}, ...]} dict.
    Each distinct src is used once; elements without src are skipped.
    The language falls back to subtitles_lang when the element gives none.
    """
    subtitles = {}
    seen_srcs = []
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.append(src)
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = (
            textstream.get('systemLanguage') or
            textstream.get('systemLanguageName') or
            textstream.get('lang') or
            subtitles_lang)
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
63757032 | 1363 | |
942acef5 S |
1364 | def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): |
1365 | xspf = self._download_xml( | |
8d6765cf | 1366 | playlist_url, playlist_id, 'Downloading xpsf playlist', |
942acef5 S |
1367 | 'Unable to download xspf manifest', fatal=fatal) |
1368 | if xspf is False: | |
1369 | return [] | |
1370 | return self._parse_xspf(xspf, playlist_id) | |
8d6765cf | 1371 | |
942acef5 | 1372 | def _parse_xspf(self, playlist, playlist_id): |
8d6765cf S |
1373 | NS_MAP = { |
1374 | 'xspf': 'http://xspf.org/ns/0/', | |
1375 | 's1': 'http://static.streamone.nl/player/ns/0', | |
1376 | } | |
1377 | ||
1378 | entries = [] | |
1379 | for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): | |
1380 | title = xpath_text( | |
98044462 | 1381 | track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) |
8d6765cf S |
1382 | description = xpath_text( |
1383 | track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') | |
1384 | thumbnail = xpath_text( | |
1385 | track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') | |
1386 | duration = float_or_none( | |
1387 | xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) | |
1388 | ||
1389 | formats = [{ | |
1390 | 'url': location.text, | |
1391 | 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), | |
1392 | 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), | |
1393 | 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), | |
1394 | } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] | |
1395 | self._sort_formats(formats) | |
1396 | ||
1397 | entries.append({ | |
1398 | 'id': playlist_id, | |
1399 | 'title': title, | |
1400 | 'description': description, | |
1401 | 'thumbnail': thumbnail, | |
1402 | 'duration': duration, | |
1403 | 'formats': formats, | |
1404 | }) | |
1405 | return entries | |
1406 | ||
1bac3455 | 1407 | def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): |
1408 | res = self._download_webpage_handle( | |
1409 | mpd_url, video_id, | |
1410 | note=note or 'Downloading MPD manifest', | |
1411 | errnote=errnote or 'Failed to download MPD manifest', | |
2d2fa82d | 1412 | fatal=fatal) |
1bac3455 | 1413 | if res is False: |
2d2fa82d | 1414 | return [] |
1bac3455 | 1415 | mpd, urlh = res |
1416 | mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() | |
1417 | ||
91cb6b50 | 1418 | return self._parse_mpd_formats( |
1bac3455 | 1419 | compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) |
2d2fa82d | 1420 | |
91cb6b50 | 1421 | def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): |
1bac3455 | 1422 | if mpd_doc.get('type') == 'dynamic': |
1423 | return [] | |
2d2fa82d | 1424 | |
91cb6b50 | 1425 | namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) |
f14be228 | 1426 | |
1427 | def _add_ns(path): | |
1428 | return self._xpath_ns(path, namespace) | |
1429 | ||
675d0016 | 1430 | def is_drm_protected(element): |
1431 | return element.find(_add_ns('ContentProtection')) is not None | |
1432 | ||
1bac3455 | 1433 | def extract_multisegment_info(element, ms_parent_info): |
1434 | ms_info = ms_parent_info.copy() | |
f14be228 | 1435 | segment_list = element.find(_add_ns('SegmentList')) |
1bac3455 | 1436 | if segment_list is not None: |
f14be228 | 1437 | segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) |
1bac3455 | 1438 | if segment_urls_e: |
1439 | ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] | |
f14be228 | 1440 | initialization = segment_list.find(_add_ns('Initialization')) |
1bac3455 | 1441 | if initialization is not None: |
1442 | ms_info['initialization_url'] = initialization.attrib['sourceURL'] | |
1443 | else: | |
f14be228 | 1444 | segment_template = element.find(_add_ns('SegmentTemplate')) |
1bac3455 | 1445 | if segment_template is not None: |
1446 | start_number = segment_template.get('startNumber') | |
1447 | if start_number: | |
1448 | ms_info['start_number'] = int(start_number) | |
f14be228 | 1449 | segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) |
1bac3455 | 1450 | if segment_timeline is not None: |
f14be228 | 1451 | s_e = segment_timeline.findall(_add_ns('S')) |
1bac3455 | 1452 | if s_e: |
1453 | ms_info['total_number'] = 0 | |
1454 | for s in s_e: | |
1455 | ms_info['total_number'] += 1 + int(s.get('r', '0')) | |
1456 | else: | |
1457 | timescale = segment_template.get('timescale') | |
1458 | if timescale: | |
1459 | ms_info['timescale'] = int(timescale) | |
1460 | segment_duration = segment_template.get('duration') | |
1461 | if segment_duration: | |
1462 | ms_info['segment_duration'] = int(segment_duration) | |
1463 | media_template = segment_template.get('media') | |
1464 | if media_template: | |
1465 | ms_info['media_template'] = media_template | |
1466 | initialization = segment_template.get('initialization') | |
1467 | if initialization: | |
1468 | ms_info['initialization_url'] = initialization | |
1469 | else: | |
f14be228 | 1470 | initialization = segment_template.find(_add_ns('Initialization')) |
1bac3455 | 1471 | if initialization is not None: |
1472 | ms_info['initialization_url'] = initialization.attrib['sourceURL'] | |
1473 | return ms_info | |
b323e170 | 1474 | |
1bac3455 | 1475 | mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) |
17b598d3 | 1476 | formats = [] |
f14be228 | 1477 | for period in mpd_doc.findall(_add_ns('Period')): |
1bac3455 | 1478 | period_duration = parse_duration(period.get('duration')) or mpd_duration |
1479 | period_ms_info = extract_multisegment_info(period, { | |
1480 | 'start_number': 1, | |
1481 | 'timescale': 1, | |
1482 | }) | |
f14be228 | 1483 | for adaptation_set in period.findall(_add_ns('AdaptationSet')): |
675d0016 | 1484 | if is_drm_protected(adaptation_set): |
1485 | continue | |
1bac3455 | 1486 | adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) |
f14be228 | 1487 | for representation in adaptation_set.findall(_add_ns('Representation')): |
675d0016 | 1488 | if is_drm_protected(representation): |
1489 | continue | |
1bac3455 | 1490 | representation_attrib = adaptation_set.attrib.copy() |
1491 | representation_attrib.update(representation.attrib) | |
a6c8b759 YCH |
1492 | # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory |
1493 | mime_type = representation_attrib['mimeType'] | |
1494 | content_type = mime_type.split('/')[0] | |
1bac3455 | 1495 | if content_type == 'text': |
1496 | # TODO implement WebVTT downloading | |
1497 | pass | |
1498 | elif content_type == 'video' or content_type == 'audio': | |
1499 | base_url = '' | |
1500 | for element in (representation, adaptation_set, period, mpd_doc): | |
f14be228 | 1501 | base_url_e = element.find(_add_ns('BaseURL')) |
1bac3455 | 1502 | if base_url_e is not None: |
1503 | base_url = base_url_e.text + base_url | |
1504 | if re.match(r'^https?://', base_url): | |
1505 | break | |
bb20526b S |
1506 | if mpd_base_url and not re.match(r'^https?://', base_url): |
1507 | if not mpd_base_url.endswith('/') and not base_url.startswith('/'): | |
1508 | mpd_base_url += '/' | |
1bac3455 | 1509 | base_url = mpd_base_url + base_url |
1510 | representation_id = representation_attrib.get('id') | |
d577c796 | 1511 | lang = representation_attrib.get('lang') |
51e9094f | 1512 | url_el = representation.find(_add_ns('BaseURL')) |
1513 | filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) | |
1bac3455 | 1514 | f = { |
154c209e | 1515 | 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, |
1bac3455 | 1516 | 'url': base_url, |
a6c8b759 | 1517 | 'ext': mimetype2ext(mime_type), |
1bac3455 | 1518 | 'width': int_or_none(representation_attrib.get('width')), |
1519 | 'height': int_or_none(representation_attrib.get('height')), | |
1520 | 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), | |
1521 | 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), | |
1522 | 'fps': int_or_none(representation_attrib.get('frameRate')), | |
1523 | 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), | |
1524 | 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), | |
d577c796 | 1525 | 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, |
1bac3455 | 1526 | 'format_note': 'DASH %s' % content_type, |
51e9094f | 1527 | 'filesize': filesize, |
1bac3455 | 1528 | } |
1529 | representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) | |
1530 | if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: | |
1531 | if 'total_number' not in representation_ms_info and 'segment_duration': | |
6a3828fd | 1532 | segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) |
1533 | representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) | |
1bac3455 | 1534 | media_template = representation_ms_info['media_template'] |
1535 | media_template = media_template.replace('$RepresentationID$', representation_id) | |
db8ee7ec | 1536 | media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) |
fb38aa8b | 1537 | media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) |
1bac3455 | 1538 | media_template.replace('$$', '$') |
b507cc92 S |
1539 | representation_ms_info['segment_urls'] = [ |
1540 | media_template % { | |
1541 | 'Number': segment_number, | |
1542 | 'Bandwidth': representation_attrib.get('bandwidth')} | |
1543 | for segment_number in range( | |
1544 | representation_ms_info['start_number'], | |
1545 | representation_ms_info['total_number'] + representation_ms_info['start_number'])] | |
1bac3455 | 1546 | if 'segment_urls' in representation_ms_info: |
1547 | f.update({ | |
1548 | 'segment_urls': representation_ms_info['segment_urls'], | |
1549 | 'protocol': 'http_dash_segments', | |
df374b52 | 1550 | }) |
1bac3455 | 1551 | if 'initialization_url' in representation_ms_info: |
1552 | initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) | |
1553 | f.update({ | |
1554 | 'initialization_url': initialization_url, | |
1555 | }) | |
1556 | if not f.get('url'): | |
1557 | f['url'] = initialization_url | |
1558 | try: | |
1559 | existing_format = next( | |
1560 | fo for fo in formats | |
1561 | if fo['format_id'] == representation_id) | |
1562 | except StopIteration: | |
1563 | full_info = formats_dict.get(representation_id, {}).copy() | |
1564 | full_info.update(f) | |
1565 | formats.append(full_info) | |
1566 | else: | |
1567 | existing_format.update(f) | |
17b598d3 | 1568 | else: |
1bac3455 | 1569 | self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) |
17b598d3 YCH |
1570 | return formats |
1571 | ||
f4b1c7ad PH |
1572 | def _live_title(self, name): |
1573 | """ Generate the title for a live video """ | |
1574 | now = datetime.datetime.now() | |
611c1dd9 | 1575 | now_str = now.strftime('%Y-%m-%d %H:%M') |
f4b1c7ad PH |
1576 | return name + ' ' + now_str |
1577 | ||
b14f3a4c PH |
def _int(self, v, name, fatal=False, **kwargs):
    """Parse v with int_or_none and report a failure to do so.

    v      -- the value to convert
    name   -- description of the field, used in the failure message
    fatal  -- raise ExtractorError on failure instead of emitting a warning
    kwargs -- forwarded unchanged to int_or_none

    Returns the parsed value, or None when parsing failed and fatal is false.
    """
    # A leftover debugging print of the get_attr attribute used to live
    # here; it wrote to stdout and raised AttributeError whenever v did not
    # carry that attribute.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
1589 | ||
def _float(self, v, name, fatal=False, **kwargs):
    """Parse v with float_or_none and report a failure to do so.

    On failure an ExtractorError is raised when fatal is true; otherwise a
    warning is emitted through the downloader and None is returned.
    kwargs are forwarded unchanged to float_or_none.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(message)
    self._downloader.report_warning(message)
    return parsed
1599 | ||
42939b61 | 1600 | def _set_cookie(self, domain, name, value, expire_time=None): |
810fb84d PH |
1601 | cookie = compat_cookiejar.Cookie( |
1602 | 0, name, value, None, None, domain, None, | |
42939b61 JMF |
1603 | None, '/', True, False, expire_time, '', None, None, None) |
1604 | self._downloader.cookiejar.set_cookie(cookie) | |
1605 | ||
799207e8 | 1606 | def _get_cookies(self, url): |
1607 | """ Return a compat_cookies.SimpleCookie with the cookies for the url """ | |
5c2266df | 1608 | req = sanitized_Request(url) |
799207e8 | 1609 | self._downloader.cookiejar.add_cookie_header(req) |
1610 | return compat_cookies.SimpleCookie(req.get_header('Cookie')) | |
1611 | ||
05900629 PH |
def get_testcases(self, include_onlymatching=False):
    """Yield the test cases declared through _TEST or _TESTS.

    Cases flagged 'only_matching' are skipped unless include_onlymatching
    is true. Each yielded dict gets a 'name' key holding the class name
    without its trailing 'IE'.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])
    ie_name = type(self).__name__[:-len('IE')]
    for case in candidates:
        if not include_onlymatching and case.get('only_matching', False):
            continue
        case['name'] = ie_name
        yield case
1625 | ||
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for case in self.get_testcases(include_onlymatching=False):
        # For playlist tests the first playlist entry is inspected.
        source = case['playlist'][0] if 'playlist' in case else case
        declared_limit = source.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_restricted = True
    # No test cases at all counts as suitable.
    return not saw_restricted
1640 | ||
def extract_subtitles(self, *args, **kwargs):
    """Return the result of _get_subtitles when subtitles were requested.

    Subtitles count as requested when the 'writesubtitles' or the
    'listsubtitles' downloader parameter is set; otherwise an empty dict
    is returned and _get_subtitles is not called.
    """
    params = self._downloader.params
    requested = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not requested:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0 JMF |
1646 | |
1647 | def _get_subtitles(self, *args, **kwargs): | |
611c1dd9 | 1648 | raise NotImplementedError('This method must be implemented by subclasses') |
a504ced0 | 1649 | |
912e0b7e YCH |
1650 | @staticmethod |
1651 | def _merge_subtitle_items(subtitle_list1, subtitle_list2): | |
1652 | """ Merge subtitle items for one language. Items with duplicated URLs | |
1653 | will be dropped. """ | |
1654 | list1_urls = set([item['url'] for item in subtitle_list1]) | |
1655 | ret = list(subtitle_list1) | |
1656 | ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) | |
1657 | return ret | |
1658 | ||
1659 | @classmethod | |
8c97f819 | 1660 | def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): |
912e0b7e | 1661 | """ Merge two subtitle dictionaries, language by language. """ |
912e0b7e YCH |
1662 | ret = dict(subtitle_dict1) |
1663 | for lang in subtitle_dict2: | |
8c97f819 | 1664 | ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) |
912e0b7e YCH |
1665 | return ret |
1666 | ||
def extract_automatic_captions(self, *args, **kwargs):
    """Return the result of _get_automatic_captions when captions were requested.

    Captions count as requested when the 'writeautomaticsub' or the
    'listsubtitles' downloader parameter is set; otherwise an empty dict
    is returned and _get_automatic_captions is not called.
    """
    params = self._downloader.params
    requested = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not requested:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5 JMF |
1672 | |
1673 | def _get_automatic_captions(self, *args, **kwargs): | |
611c1dd9 | 1674 | raise NotImplementedError('This method must be implemented by subclasses') |
360e1ca5 | 1675 | |
d77ab8e2 S |
def mark_watched(self, *args, **kwargs):
    """Call _mark_watched when marking was requested and a session is available.

    Marking is requested through the 'mark_watched' downloader parameter.
    It is only attempted when _get_login_info yields a first item that is
    not None or a 'cookiefile' parameter is set.
    """
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    has_login = self._get_login_info()[0] is not None
    if has_login or params.get('cookiefile') is not None:
        self._mark_watched(*args, **kwargs)
1681 | ||
1682 | def _mark_watched(self, *args, **kwargs): | |
1683 | raise NotImplementedError('This method must be implemented by subclasses') | |
1684 | ||
8dbe9899 | 1685 | |
d6983cb4 PH |
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that answer paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (a single result), a positive number of results, or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """ Return the regular expression matching this extractor's queries """
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """ Tell whether url is a search query for this extractor """
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """ Split the query into prefix and search terms and fetch the results """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            # Clamp the request to what the service can deliver.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """ Read-only access to _SEARCH_KEY """
        return self._SEARCH_KEY