]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[youtube] Skip download for multiple v= test
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to an
# outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default supplied". xpath_text compares against it by
# identity, so that None stays usable as a real default value.
NO_DEFAULT = object()

# Full English month names, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70
71
def preferredencoding():
    """Return the text encoding to use on this system.

    The locale's preferred encoding is returned when it names a codec that
    can actually encode text; in every other case 'UTF-8' is returned.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
d77c3dfd 85
f4bfd65f 86
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    legacy_python = sys.version_info < (3, 0)

    if legacy_python and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename / os.path.dirname return bytes objects here, but
        # NamedTemporaryFile fails on non ascii characters unless it is
        # given unicode objects.
        def path_basename(path):
            return os.path.basename(path).decode(fs_encoding)

        def path_dirname(path):
            return os.path.dirname(path).decode(fs_encoding)
    else:
        path_basename, path_dirname = os.path.basename, os.path.dirname

    tmp_options = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if legacy_python:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; then report the failure.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Return the first element matching xpath[@key], or xpath[@key=val] when val is given """
        assert re.match(r'^[a-zA-Z-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Return the first element matching xpath that has attribute key (equal to val when val is given) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath)
            if key in el.attrib and (val is None or el.attrib.get(key) == val))
        return next(candidates, None)
162
d7e66d39
JMF
163# On python2.6 the xml.etree.ElementTree.Element methods don't support
164# the namespace parameter
5f6a1245
JW
165
166
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag', looking prefix up in ns_map.

    Steps without a colon are kept as they are.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
177
d77c3dfd 178
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element below node matching xpath.

    When no element matches, or the element has no text: return default if
    one was passed, otherwise raise ExtractorError if fatal is set (name,
    falling back to xpath, is used in the message), otherwise return None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
193
194
def get_element_by_id(id, html):
    """Return the content of the tag in the HTML document html whose id attribute is id"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
198
12ea2f30 199
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the HTML-unescaped content of the first tag in html that carries attribute=value.

    Returns None when no such tag is found.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote character loses its first and last character
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 221
9e6dd238
FV
222
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string (None is passed through)"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines become spaces; <br> and </p><p> become newlines
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
238
239
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' stands for standard output (its binary buffer when there is one).
    If opening fails for a reason other than EACCES, the name is passed
    through sanitize_path() and, when that changed it, opened again; an
    error from that second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
270
271
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a UNIX timestamp; None if it cannot be parsed"""
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
1c469a94 279
5f6a1245 280
def sanitize_filename(s, restricted=False, is_id=False):
    """Make the string s usable as part of a filename.

    With restricted set, a stricter subset of characters is allowed
    (spaces, some punctuation and all non-ASCII characters become '_').
    Set is_id if s is not an arbitrary string but an ID that should be
    kept if possible; the clean-up of underscores, leading '-' and
    leading '.' is then skipped.
    """
    def clean_char(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127):
            return '_'
        return ch

    # Handle timestamps: 0:12:34 becomes 0_12_34 instead of going through the ':' rule
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join(clean_char(ch) for ch in s)
    if is_id:
        return cleaned

    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[1:]
    cleaned = cleaned.lstrip('.')
    return cleaned or '_'
d77c3dfd 317
5f6a1245 318
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged. On Windows the path is
    normalized and, in every component except '.' and '..', each of the
    characters / < > : " | \\ ? * as well as a trailing dot is replaced
    by '#'. A drive or UNC prefix is kept as it is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component in front of the first separator
        norm_path.pop(0)
    # Raw string: same pattern as before, without the invalid '\.' escape
    forbidden = r'(?:[/<>:"\|\\?\*]|\.$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
335
336
def orderedSet(iterable):
    """ Return the elements of iterable as a list without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # Plain list membership, so unhashable elements work too
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 344
912b38b4 345
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity (given without '&' and ';') to a character."""
    # Known non-numeric HTML entity
    named = compat_html_entities.name2codepoint
    if entity in named:
        return compat_chr(named[entity])

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is None:
        # Unknown entity in name, return its literal representation
        return ('&%s;' % entity)

    digits = numeric.group(1)
    if digits.startswith('x'):
        # Prefix a zero so that int() sees the familiar '0x..' notation
        return compat_chr(int('0%s' % digits, 16))
    return compat_chr(int(digits, 10))
4e408e47
PH
364
365
def unescapeHTML(s):
    """Replace each '&...;' entity in s through _htmlentity_transform (None is passed through)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
d77c3dfd 373
8bf48f23 374
aa49acd1
S
def get_subprocess_encoding():
    """Return the name of the encoding to use for strings exchanged with subprocesses"""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
385
386
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file APIs (or by subprocess calls
    when for_subprocess is set).

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if not for_subprocess:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
            return s

    # Characters the target encoding cannot represent are dropped
    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string filename with the subprocess encoding.

    On Python 3, and for any value that is not a bytes object, b is
    returned unchanged. Undecodable bytes are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 416
f07b74fc
PH
417
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True)"""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
425
426
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument; shorthand for decodeFilename(b, for_subprocess=True)"""
    return decodeFilename(b, for_subprocess=True)
429
430
8271226a
PH
def decodeOption(optval):
    """Return an option value as text; bytes are decoded with preferredencoding(), None is passed through"""
    if optval is None:
        return None
    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
1c256f70 439
5f6a1245 440
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The shortest form that fits is used: below one minute just the
    seconds, below one hour minutes and seconds, otherwise hours, minutes
    and seconds. Exactly 60 and exactly 3600 seconds roll over to the next
    unit ('1:00' and '1:00:00').
    """
    # '>=' rather than '>': 60 must be '1:00', not '60'; 3600 must be '1:00:00', not '60:00'
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
448
a0ddb8a2 449
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python's ssl module.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are handed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 473
732ea2f0 474
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages that asks the user to report the issue"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
484
485
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        The error also counts as expected when it is created while a URLError,
        socket.timeout or UnavailableVideoError is being handled.
        video_id, if given, is put in front of the message; cause is appended to it.
        """
        active_exc = sys.exc_info()  # preserve original exception

        if active_exc[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = active_exc
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none"""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 513
1c256f70 514
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for an unsupported URL; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
525
526
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the sys.exc_info() result of the exception that caused the trouble. """
        super(DownloadError, self).__init__(msg)
        # Kept for callers that inspect the original exception
        self.exc_info = exc_info
d77c3dfd
FV
539
540
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error description; it is available as .msg and as str(error)."""
        # Initialise the base class so that str(), repr() and .args carry
        # the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 559
5f6a1245 560
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded and expected are sizes, both in bytes."""
        # Give the exception a message so that str()/repr() and log output
        # are not empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 587
5f6a1245 588
c5a59d93 589def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
590 hc = http_class(*args, **kwargs)
591 source_address = ydl_handler._params.get('source_address')
592 if source_address is not None:
593 sa = (source_address, 0)
594 if hasattr(hc, 'source_address'): # Python 2.7+
595 hc.source_address = sa
596 else: # Python 2.6
597 def _hc_connect(self, *args, **kwargs):
598 sock = compat_socket_create_connection(
599 (self.host, self.port), self.timeout, sa)
600 if is_https:
d7932313
PH
601 self.sock = ssl.wrap_socket(
602 sock, self.key_file, self.cert_file,
603 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
604 else:
605 self.sock = sock
606 hc.connect = functools.partial(_hc_connect, hc)
607
608 return hc
609
610
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is stored as self._params (read by _create_http_connection); other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req through a connection built by _create_http_connection (plain HTTP)."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data: first as a raw stream, then, if that fails, with a zlib header."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not addinfourl accepts it as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # Without getcode support the code is set on the object afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Prepare an outgoing request: escape its URL, add missing std_headers, honour Youtubedl-no-compression.

        Returns the request to send, which is a new object when the URL had to be escaped.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            # Keep HEAD requests as HEAD requests
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it together with Accept-encoding
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate response bodies and percent-encode a redirect's Location header.

        Returns resp itself, or a replacement response wrapping the decompressed body.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if no attempt
                # succeeds, the first error is raised again.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 733
5de90176 734
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler whose connections are built by _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is stored as self._params; https_conn_class defaults to HTTPSConnection."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, handing on the SSL settings this handler has."""
        # _context exists on python > 2.6, _check_hostname on python 3.x;
        # only the attributes that are present are forwarded.
        ssl_options = dict(
            (name, getattr(self, '_' + name))
            for name in ('context', 'check_hostname')
            if hasattr(self, '_' + name))
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **ssl_options)
be4a824d
PH
750
751
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS. When timezone (a
    datetime.timedelta) is not passed, a trailing 'Z' or +HH:MM / -HHMM
    offset, optionally preceded by fractional seconds, is cut off date_str
    and used as the offset; without one, UTC is assumed.
    Returns None if date_str is None.
    """

    if date_str is None:
        return None

    if timezone is None:
        timezone = datetime.timedelta()
        suffix = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if suffix:
            date_str = date_str[:-len(suffix.group(0))]
            if suffix.group('sign'):
                direction = 1 if suffix.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(suffix.group('hours')),
                    minutes=direction * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(utc_time.timetuple())
776
777
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how all-numeric dates are read: day before month
    when set, month before day otherwise. Returns None when date_str is
    None or cannot be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str) is None:
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        numeric_formats = [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        numeric_formats = [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; when several fit, the last one decides
    for fmt in unambiguous_formats + numeric_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
841
5f6a1245 842
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url: the text after the last '.' in front of any '?'.

    default_ext is returned when url is None or when that text is not
    purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 851
5f6a1245 852
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 855
5f6a1245 856
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted as well. Months count as
    30 days and years as 365 days. A string in none of these forms makes
    strptime raise ValueError.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: the pattern contains \d, an invalid escape in a plain literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
884
885
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '%s-%s-%s' % parts.groups()
894
5f6a1245 895
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends included)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start means the earliest representable date, a missing
        end the latest one. ValueError is raised if start is after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
c496ca96
PH
925
926
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
935
936
b58ddb32
PH
def _windows_write_string(s, out):
    """ Write s to the Windows console that backs out, using WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps a file descriptor to the value handed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # AttributeError: the output stream doesn't have a fileno, it's virtual
        # UnsupportedOperation: some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    wintypes = ctypes.wintypes

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    console_handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        wintypes.DWORD, wintypes.DWORD)(
        (b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        file_type = GetFileType(handle) & ~FILE_TYPE_REMOTE
        return (file_type != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0)

    if not_a_console(console_handle):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the BMP, or len(s) if none
        for i, c in enumerate(s):
            if ord(c) > 0xffff:
                return i
        return len(s)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written as 2 UTF-16 code units
        ret = WriteConsoleW(
            console_handle, s, count if count else 2,
            ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1010
1011
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr if out is None) and flush it.

    s must be a compat_str. When encoding is None the stream's own
    encoding or the preferred encoding is used for byte output."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and
        hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1032
1033
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the elements of the byte string bs as a list of integers"""
    if not bs:
        return []
    # Python 3: indexing bytes already yields ints
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1041
c257baff 1042
def intlist_to_bytes(xs):
    """Pack the integers in xs into a byte string, one unsigned byte each"""
    if xs:
        spec = '%dB' % len(xs)
        return struct_pack(spec, *xs)
    return b''
c38b1e77
PH
1047
1048
c1c9a79c
PH
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        if exclusive:
            operation = fcntl.LOCK_EX
        else:
            operation = fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1112
1113
class locked_file(object):
    """Context manager around a file that is locked for the duration of
    the with-block.

    The file is opened in __init__; the lock is taken in __enter__
    (shared for mode 'r', exclusive for 'a' and 'w') and released in
    __exit__, where the file is closed as well."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in both cases before re-raising
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1143
1144
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1148
1149
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces"""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(fs_encoding)
        return arg

    return ' '.join(pipes.quote(as_text(a)) for a in args)
9d4660ca
PH
1159
1160
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The JSON-encoded data travels as a query-style string in the fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
9d4660ca
PH
1167
1168
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default)."""
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
02dbf93f
PH
1176
1177
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric str, or None (which gives 'N/A').
    The result has two decimals, e.g. '1.50KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: values >= 1024 ** 9 would otherwise
        # raise IndexError and tiny fractions would index from the end
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1190
1c088fa8 1191
be64b5b0
PH
def parse_filesize(s):
    """Parse a file size such as '5 MiB' or '1,5 KB' into a number of bytes.

    Returns None if s is None or does not start with a number followed
    by a known unit."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    unit_alternatives = '|'.join(map(re.escape, _UNIT_TABLE))
    mobj = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives, s)
    if mobj is None:
        return None

    # A comma is treated as the decimal separator
    number = float(mobj.group('num').replace(',', '.'))
    multiplier = _UNIT_TABLE[mobj.group('unit')]
    return int(number * multiplier)
be64b5b0
PH
1244
1245
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        position = ENGLISH_MONTH_NAMES.index(name)
    except ValueError:
        # Not an English month name
        return None
    return position + 1
1253
1254
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # An abbreviation is the first three letters of the month name
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
18258362
JMF
1263
1264
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'

    An '&' that already starts one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference is left alone."""
    # Character references may have any number of digits (e.g. &#x1F600;)
    # but need at least one
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
e3946f98
PH
1271
1272
def setproctitle(title):
    """Set the process name to title (a compat_str) via prctl in libc.so.6.

    Does nothing if libc.so.6 cannot be loaded or has no prctl."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one item larger than
    # the title, so that prctl receives a NUL-terminated string
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1286
1287
def remove_start(s, start):
    """Return s without the prefix start; s is unchanged if it lacks it"""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1292
1293
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is unchanged if it lacks it"""
    # An empty suffix must be skipped: s[:-0] would be the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1298
1299
def url_basename(url):
    """Return the last segment of the path component of url"""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1303
1304
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() reports "HEAD" """

    def get_method(self):
        return "HEAD"
7217e148
PH
1308
1309
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, multiplied by invscale and floor-divided by scale.

    If get_attr is given, the value is read from that attribute of v.
    None and the empty string give default instead."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    return int(v) * invscale // scale
1317
9572013d 1318
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None"""
    if v is None:
        return default
    return compat_str(v)
1321
9732d77e
PH
1322
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop ',', '.' and '+' before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1329
1330
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, multiplied by invscale and divided by scale;
    None gives default"""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1333
1334
def parse_duration(s):
    """Parse a duration string such as '1:02:03', '3 min' or 'PT5M10S'
    into a number of seconds.

    Returns None if s is not a string or has an unsupported format."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone (possibly fractional) amount of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components and the number of seconds each unit is worth
    SECONDS_PER_UNIT = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, unit_seconds in SECONDS_PER_UNIT:
        value = m.group(group_name)
        if value:
            res += int(value) * unit_seconds
    # The fractional part ('.5') turns the result into a float
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1379
1380
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
1387
1388
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename by ext.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1394
1395
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1404
1405
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is merged into stdout so either stream can carry the version
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1419
1420
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from the output (a compat_str) of an executable.

    Returns the first group of version_re (default: the word following
    'version'), or unrecognized if the pattern is not found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    if not mobj:
        return unrecognized
    return mobj.group(1)
1430
1431
class PagedList(object):
    """Base class of paged lists; subclasses provide getslice()"""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1436
9c44d242
PH
1437
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for each page it needs"""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start..end-1 (to the end if end is None)"""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = 0
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize

            # Offset just past the last wanted entry, if the slice ends here
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1479
1480
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start..end-1 (to the end if end is None)"""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            # Number of entries that are still wanted
            only_more = end - start
        # Entries to drop at the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1508
1509
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in s by its character"""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def decode_match(mobj):
        # The decoder returns (text, consumed length)
        return unicode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', decode_match, s)
0fe2ff78
YCH
1516
1517
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in s by its character"""
    unicode_escape = codecs.getdecoder('unicode_escape')

    def decode_match(mobj):
        # The decoder returns (text, consumed length)
        return unicode_escape(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', decode_match, s)
b53466e1 1524
d05cfe06
S
1525
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed to quote() as safe
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1531
1532
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only path, params, query and fragment are escaped
    escaped_parts = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped_parts.geturl()
1542
# Probe whether struct accepts a text format string
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        return struct.pack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)

    def struct_unpack(spec, *args):
        return struct.unpack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)
else:
    # struct can be used directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1559
1560
def read_batch_urls(batch_fd):
    """Read the URLs from the batch file object batch_fd, which is closed.

    Each line is stripped; a leading byte order mark is removed. Empty
    lines and lines starting with '#', ';' or ']' are skipped.
    Returns the list of remaining URLs."""
    # '\ufeff' is the byte order mark of correctly decoded text,
    # '\xef\xbb\xbf' the same mark when its bytes were decoded one by one
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1575
1576
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes"""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1579
1580
0990305d
PH
# etree_iter(n) iterates over the elements of the tree n
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1585
1586
bcf89ce6
PH
def parse_xml(s):
    """Parse the text s as XML, ignoring doctype declarations"""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    kwargs = {}
    parser = etree.XMLParser(target=TreeBuilder())
    # The parser is only handed over on Python >= 2.7
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1602
1603
a1a530b0
PH
# Age limit that corresponds to each US rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+', or a US rating.

    Returns None if s is None or is neither of these."""
    if s is None:
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1618
1619
def strip_jsonp(code):
    """Remove the callback wrapper of a JSONP response, keeping its argument"""
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_pattern, r'\1', code)
478c2c61
PH
1623
1624
e05f6939
PH
def js_to_json(code):
    """Convert JavaScript object-literal code to JSON.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and a trailing comma before ']' or '}' is removed."""
    # Rewrites applied inside single-quoted strings
    QUOTE_FIXES = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        # JSON literals and double-quoted strings are already valid
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            v = re.sub(
                r"\\\\|\\'|\"", lambda quote: QUOTE_FIXES[quote.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Remove trailing commas
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1648
1649
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality, unknown ids give -1
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1661
a020a0dc
PH
1662
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # The ellipses count towards the length limit
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1671
1672
def version_tuple(v):
    """Split the version string v at '.' and '-' into a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1675
1676
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    A missing or unparseable version counts as outdated only if
    assume_new is false."""
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
732ea2f0
PH
1684
1685
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Loaded through zipimport
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # sys.frozen is set
    return hasattr(sys, 'frozen')
7d4111ed
PH
1691
1692
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1696
1697
c460bdd5
PH
def mimetype2ext(mt):
    """Return the file extension for the MIME type mt.

    The part after the last '/' is used unless it is listed below."""
    # Subtypes that differ from the extension
    SUBTYPE_TO_EXT = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    subtype = mt.rpartition('/')[2]
    return SUBTYPE_TO_EXT.get(subtype, subtype)
1706
1707
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Detect the file extension from the headers of url_handle.

    The file name of a Content-Disposition attachment is preferred;
    otherwise the extension is derived from Content-Type."""
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1724
1725
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1734
1735
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The 4-byte marks come before the 2-byte marks they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        # No byte order mark
        text = first_bytes.decode('utf-8', 'replace')

    # The result is the match object (or None)
    return re.match(r'^\s*<', text)
a055469f
PH
1754
1755
def determine_protocol(info_dict):
    """Return the protocol of the format described by info_dict.

    An explicit 'protocol' entry wins; otherwise the protocol is derived
    from the start of the URL, its extension, or its scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1776
1777
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    column_formats = []
    for col in zip(*table):
        # Left-align to the widest cell of the column plus one space
        width = max(len(compat_str(v)) for v in col)
        column_formats.append('%-' + compat_str(width + 1) + 's')
    # The last column is not padded
    format_str = ' '.join(column_formats[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
1784
1785
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    Supported are comparisons ('key OP value', with '?' after OP making
    a missing key pass) and the unary tests 'key' and '!key'.
    Raises ValueError for an invalid filter part."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    mobj = comparison_rex.search(filter_part)
    if mobj:
        op_symbol = mobj.group('op')
        compare = COMPARISON_OPERATORS[op_symbol]
        string_value = mobj.group('strval')
        if string_value is not None:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = string_value
        else:
            number_text = mobj.group('intval')
            try:
                comparison_value = int(number_text)
            except ValueError:
                # Not a plain integer: try a file size, with and without
                # a trailing 'B'
                comparison_value = parse_filesize(number_text)
                if comparison_value is None:
                    comparison_value = parse_filesize(number_text + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            number_text, filter_part))
        actual_value = dct.get(mobj.group('key'))
        if actual_value is None:
            # Truthy only if the operator was followed by '?'
            return mobj.group('none_inclusive')
        return compare(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    mobj = unary_rex.search(filter_part)
    if mobj:
        test = UNARY_OPERATORS[mobj.group('op')]
        return test(dct.get(mobj.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1843
1844
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&'; evaluation stops at the first
    # part that does not match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1850
1851
def match_filter_func(filter_str):
    """Return a function that checks an info dict against filter_str.

    That function returns None if the info dict passes the filter and a
    message saying that the video is skipped otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, else its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
1860
1861
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds.

    Supported are an offset in seconds ('12.5' or '12.5s') and a clock
    time ('HH:MM:SS' with optional fraction). An empty expression gives
    0.0, an unsupported one None."""
    if not time_expr:
        return 0.0

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1873
1874
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'"""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
1877
1878
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle text to SRT text.

    Raises ValueError if the document contains no <p> elements."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        pieces = [str_or_empty(node.text)]

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                # A line break, followed by the text after it
                pieces.append('\n' + str_or_empty(child.tail))
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                pieces.append(str_or_empty(parse_node(child)))
            else:
                # Any other element is kept as serialized XML
                pieces.append(str_or_empty(xml.etree.ElementTree.tostring(child)))

        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    # Try both namespaces, then no namespace at all
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No usable 'end': derive it from the duration
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1919
1920
39672624
YCH
class ISO639Utils(object):
    """Lookup helpers between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Map an ISO 639-1 code to its ISO 639-2/T code.

        Only the first two characters of ``code`` are used for the lookup.
        Returns None when they are not a known two-letter code.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Map an ISO 639-2/T code to its ISO 639-1 code.

        Returns None when ``code`` is not one of the mapped three-letter codes.
        """
        matches = (
            two_letter
            for two_letter, three_letter in cls._lang_map.items()
            if three_letter == code)
        return next(matches, None)
2121
2122
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup helper from two-letter country codes to country names."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The lookup is case-insensitive (the code is upper-cased first).
        Returns None when the code is not in the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2381
2382
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler whose proxy can be overridden on a per-request basis.

    A request carrying a 'Ytdl-request-proxy' header is routed through the
    proxy named by that header; the header is removed before the request is
    passed on. The sentinel value '__noproxy__' means "do not use a proxy".
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # Installed before the base initializer runs, so that proxy_open is
        # still reached for http/https when no proxy is configured for them.
        # `scheme` and the bound method are captured as lambda defaults so
        # each handler keeps its own values.
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open ``req`` via ``proxy``, honouring a per-request override.

        Returns None when the effective proxy is '__noproxy__'; otherwise
        returns whatever the base class's proxy_open returns.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # Strip the internal header before the request goes any further.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy