]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[youtube] Use 'vp8' and 'vp9' in lowercase (fixes #6358)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# There is no clearly defined public name for the type of a compiled
# pattern, so take it from a throwaway pattern object.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to every request which does
# not already carry them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel meaning "no default supplied" (None may be a real default).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70
71
def preferredencoding():
    """Return the name of an encoding that is usable on this system.

    The candidate is locale.getpreferredencoding(); if asking for it fails,
    or the codec it names cannot encode a test string, 'UTF-8' is returned
    instead.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec; an unknown or broken codec name raises here.
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 85
f4bfd65f 86
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible

    The data is first written to a temporary file next to fn, which is then
    renamed over fn. If anything fails, the temporary file is removed and the
    exception is re-raised.
    """
    fn = encodeFilename(fn)

    if sys.version_info < (3, 0) and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is handed
        # unicode objects, so decode their results.
        def path_basename(f):
            return os.path.basename(f).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(fs_encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream.
    if sys.version_info < (3, 0):
        tmp_args['mode'] = 'wb'
    else:
        tmp_args['mode'] = 'w'
        tmp_args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite on Windows (WindowsError or
            # FileExistsError), so drop any existing target first.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then propagate.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath[@key=val], or None """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath whose key attribute equals val, or None """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
159
d7e66d39
JMF
160# On python2.6 the xml.etree.ElementTree.Element methods don't support
161# the namespace parameter
5f6a1245
JW
162
163
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix are
    left untouched. An unknown prefix raises KeyError.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
174
d77c3dfd 175
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath under node.

    When the element is missing or has no text: return default if one was
    given; otherwise raise ExtractorError if fatal is set (naming the element
    by name, or by xpath when name is None); otherwise return None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
190
191
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute lookup.
    return get_element_by_attribute('id', id, html)
195
12ea2f30 196
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the unescaped content of the first tag carrying attribute=value.

    Returns None when no such tag is found in the passed HTML document.
    """
    # Verbose-mode pattern: tag name, any other attributes, the wanted
    # attribute, any remaining attributes, then the content up to the
    # matching closing tag.
    match = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not match:
        return None

    content = match.group('content')
    # Content that opens with a quote character loses its first and last character.
    if content.startswith('"') or content.startswith("'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 218
9e6dd238
FV
219
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""

    # Pass None through, so callers can sanitize optional fields directly.
    if html is None:
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph breaks do.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode the entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
9e6dd238
FV
235
236
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    '-' stands for standard output (its binary buffer where available).
    If opening fails for a reason other than EACCES, the name is passed
    through sanitize_path() and, when that changes it, the open is retried;
    otherwise the original error is re-raised, like the standard open().

    Returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming.
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
267
268
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 276
5f6a1245 277
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitize a string so it can be used as part of a filename.

    If restricted is set, a stricter subset of characters is allowed.
    Set is_id if s is not an arbitrary string but an ID that should be kept
    as intact as possible: the clean-up after the per-character replacement
    is skipped for IDs.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: keep 0:12:34 readable as 0_12_34
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Replace a leading dash by an underscore.
    if result.startswith('-'):
        result = '_' + result[1:]
    # Strip leading dots.
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 314
5f6a1245 315
a2aaf4db
S
def sanitize_path(s):
    """Sanitize and normalize a path on Windows.

    On every other platform s is returned unchanged. On Windows the path is
    normalized, and in each component other than '.' and '..' the characters
    / < > : " | \\ ? * and a trailing dot are replaced by '#'. A drive or
    UNC prefix is preserved as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    # Python 2.6's splitdrive does not recognize UNC paths.
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the leading separator.
        norm_path.pop(0)
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence '\.', which newer Python versions warn about. The pattern
    # itself is unchanged.
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
332
333
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, keeping first-seen order """
    # Membership is tested with ==, so unhashable items work too.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 341
912b38b4 342
4e408e47
PH
def _htmlentity_transform(entity):
    """Transform an HTML entity (given without '&' and ';') to a character.

    Named entities are looked up in compat_html_entities.name2codepoint;
    numeric entities ('#123', '#x7b') are converted directly. Anything that
    cannot be converted is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() wants the '0x' prefix form
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range (e.g. '#9999999999'):
            # keep the literal text instead of aborting the whole unescape.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
361
362
def unescapeHTML(s):
    """Replace every '&...;' entity in s by the character it stands for.

    None is passed through; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
d77c3dfd 370
8bf48f23 371
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for file names and subprocess arguments.

    On Windows 2000 and up this is the preferred locale encoding, elsewhere
    the filesystem encoding; 'utf-8' is used when neither is known.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
382
383
def encodeFilename(s, for_subprocess=False):
    """Return the file name s in the form the platform's file APIs expect.

    @param s               The name of the file (a compat_str)
    @param for_subprocess  Set when the name is handed to a subprocess
                           rather than to a file API
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_file_api = (
        not for_subprocess and
        sys.platform == 'win32' and
        sys.getwindowsversion()[0] >= 5)
    if unicode_file_api:
        return s

    # Characters the encoding cannot represent are dropped.
    return s.encode(get_subprocess_encoding(), 'ignore')
402
403
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name into text on Python 2.

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged. for_subprocess is accepted but not used by this function.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    # Undecodable bytes are dropped.
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 413
f07b74fc
PH
414
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., True).

    Byte strings from legacy callers are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
422
423
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument by calling decodeFilename with for_subprocess set."""
    return decodeFilename(b, for_subprocess=True)
426
427
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as text.

    None is passed through; bytes are decoded with the preferred encoding.
    The result is asserted to be a compat_str.
    """
    if optval is None:
        return optval

    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
1c256f70 436
5f6a1245 437
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: hours appear from 3600 seconds on,
    minutes from 60 seconds on. The boundaries are inclusive, so 3600 gives
    '1:00:00' (not '60:00') and 60 gives '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
445
a0ddb8a2 446
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dict; a true 'nocheckcertificate' entry disables
    certificate verification. Extra keyword arguments are forwarded to the
    handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            # check_hostname is switched off before verify_mode is relaxed.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 470
732ea2f0 471
08f2a92c
JMF
def bug_reports_message():
    """Return the standard "please report this issue" suffix for error messages.

    The update hint depends on whether ytdl_is_updateable() reports that the
    program can update itself.
    """
    update_cmd = (
        'type youtube-dl -U to update' if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
481
482
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """
        # Errors raised while handling these exceptions are treated as expected.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the bug-report instructions.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return ''.join(formatted)
01951dda 510
1c256f70 511
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
517
518
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
522
523
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
536
537
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
545
546
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error message; it is also available as the .msg attribute. """
        # Initialize the base class so that str(exc) and exc.args carry the
        # message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 556
5f6a1245 557
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
d77c3dfd
FV
561
562
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
d77c3dfd
FV
570
571
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the received and announced sizes, both in bytes. """
        # Initialize the base class with a readable message; previously
        # str(exc) was empty because Exception.__init__ was never called.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 586
5f6a1245 587
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an http_class connection, bound to the configured source address.

    The 'source_address' entry of ydl_handler._params, if present, is used as
    the local address to connect from. args and kwargs go to http_class.
    """
    conn = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return conn

    local_addr = (source_address, 0)
    if hasattr(conn, 'source_address'):  # Python 2.7+
        conn.source_address = local_addr
        return conn

    # Python 2.6: connections know no source_address, so replace connect().
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, local_addr)
        if is_https:
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = sock
    conn.connect = functools.partial(_hc_connect, conn)

    return conn
608
609
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds the
    standard headers to every HTTP request and transparently decodes gzipped
    and deflated responses. To avoid compression for a particular request,
    the calling code only has to include the HTTP header
    "Youtubedl-No-Compression", which is removed before the real request is
    made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(conn_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream, then with a zlib header."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, also where the class takes no code argument."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        for name, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; give up with the
                # original error if none of the attempts decodes.
                for cut in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-cut]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same header and decompression treatment.
    https_request = http_request
    https_response = http_response
bf50b038 701
5de90176 702
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection.

    https_conn_class selects the connection class; it defaults to
    compat_http_client.HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward only the SSL settings the base handler actually stored.
        extra = {}
        if hasattr(self, '_context'):  # python > 2.6
            extra['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            extra['check_hostname'] = self._check_hostname

        conn_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(conn_factory, req, **extra)
be4a824d
PH
718
719
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds and 'Z' or a +HH:MM / +HHMM offset. timezone, if
    given, is a datetime.timedelta offset from UTC; in that case date_str is
    used as is. None yields None.
    """

    if date_str is None:
        return None

    if timezone is None:
        # UTC unless the string ends in an explicit offset.
        timezone = datetime.timedelta()
        suffix = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if suffix:
            # Drop the fraction/zone suffix; strptime below does not know it.
            date_str = date_str[:-len(suffix.group(0))]
            if suffix.group('sign'):
                direction = 1 if suffix.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(suffix.group('hours')),
                    minutes=direction * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(utc_dt.timetuple())
744
745
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first decides how ambiguous numeric dates such as 8/7/2009 are read.
    Returns None for None and for input that no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so cut the offset
    # off, unless the whole string is a dash-separated numeric date.
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        formats += [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        formats += [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; when several match, the last one wins.
    for fmt in formats:
        try:
            parsed = datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        upload_date = parsed.strftime('%Y%m%d')

    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
809
5f6a1245 810
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    The candidate is whatever follows the last dot before any '?'. It is
    returned when it is purely alphanumeric; otherwise, and for a None url,
    default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 819
5f6a1245 820
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 823
5f6a1245 824
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'yesterday', or (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days. A string in none of
    these forms raises ValueError.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence '\d'. The pattern itself is unchanged.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years, so approximate them
        # in days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
852
853
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
862
5f6a1245 863
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start means the earliest representable date, a missing end
        the latest one. ValueError is raised when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
893
894
def platform_name():
    """ Returns the platform name (platform.platform()) as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
903
904
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptors handled here (1 and 2) to the ids that are
    # passed to GetStdHandle.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # AttributeError: the output stream doesn't have a fileno, it's virtual
        # UnsupportedOperation: some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", kernel32))
    handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(candidate):
        if candidate == INVALID_HANDLE_VALUE or candidate is None:
            return True
        return ((GetFileType(candidate) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(candidate, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none.
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    while s:
        # Write at most 1024 characters at a time, stopping in front of any
        # non-BMP character; such a character is then written on its own
        # with a length of 2.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            handle, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
978
979
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API
    is tried first. Otherwise the text is encoded (ignoring characters
    that cannot be encoded) for byte streams, or written as-is. """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    wants_bytes = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1000
1001
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values. """
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings otherwise
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1009
c257baff 1010
def intlist_to_bytes(xs):
    """ Pack a list of integer byte values into a byte string. """
    if xs:
        spec = '%dB' % len(xs)
        return struct_pack(spec, *xs)
    return b''
c38b1e77
PH
1015
1016
c1c9a79c
PH
# Cross-platform file locking: _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Byte range (low and high DWORD) passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f; raises OSError if LockFileEx fails. """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file passes it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0, whole_low, whole_high,
            f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Unlock a file locked by _lock_file; raises OSError on failure. """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ flock() f exclusively or shared, depending on the flag. """
        if exclusive:
            operation = fcntl.LOCK_EX
        else:
            operation = fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """ Release the flock() held on f. """
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1080
1081
class locked_file(object):
    """ Context manager around a file that is locked while in use.

    The file is opened on construction. Entering the context takes the
    lock (shared for mode 'r', exclusive otherwise); leaving it unlocks
    and closes the file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even when unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1111
1112
4644ac55
S
def get_filesystem_encoding():
    """ Return the file system encoding, or 'utf-8' if it is unknown. """
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1116
1117
def shell_quote(args):
    """ Return the arguments joined into one shell-escaped command line.

    args is an iterable of text or byte strings. Byte strings are
    decoded with the file system encoding before they are quoted. """
    quoted_args = []
    encoding = None
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            if encoding is None:
                # Only look the encoding up when it is actually needed
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        # shlex_quote instead of the undocumented, deprecated pipes.quote;
        # this is also what args_to_str uses
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1127
1128
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    fragment; unsmuggle_url reverses this. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
9d4660ca
PH
1135
1136
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is everything after the last '#'
    url, sdata = smug_url.rsplit('#', 1)
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1144
1145
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a human-readable string such as '1.50KiB'.

    bytes may be a number, a numeric string or None; None gives 'N/A'.
    The unit is limited to the known suffixes, so tiny values are shown
    in B and huge ones in YiB. Negative values raise ValueError (from
    math.log). """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp: values below 1 would otherwise index the list from the
        # end, and values of 1024 ** 9 or more would run past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1158
1c088fa8 1159
be64b5b0
PH
def parse_filesize(s):
    """ Parse a file size such as '1.5 MiB' into a number of bytes.

    Returns None if s is None or does not start with a number followed
    by a known unit. A comma is accepted as decimal separator. """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are binary multiples,
    # 'XB' and 'Xb' decimal ones.
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[letter + 'iB'] = binary
        _UNIT_TABLE[letter + 'B'] = decimal
        _UNIT_TABLE[letter.lower() + 'B'] = binary
        _UNIT_TABLE[letter + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1212
1213
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not in ENGLISH_MONTH_NAMES. """
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1221
1222
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    The abbreviation is the first three letters of the month name;
    returns None if no month matches. """
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1231
1232
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'

    An ampersand is left alone when it starts one of the predefined
    entities (amp, lt, gt, apos, quot) or a numeric character reference.
    References need at least one digit and may be as long as the largest
    code point (6 hex or 7 decimal digits), so characters beyond U+FFFF
    are kept intact."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        '&amp;',
        xml_str)
e3946f98
PH
1239
1240
def setproctitle(title):
    """ Set the name of the current process to title, where supported.

    title must be a compat_str. Does nothing if libc.so.6 cannot be
    loaded or does not provide prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes allocates one extra byte for
    # the NUL terminator; a buffer of exactly len(title_bytes) bytes would
    # leave the string unterminated and let prctl read past its end
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME on Linux
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1254
1255
def remove_start(s, start):
    """ Return s without the prefix start, or s unchanged if absent. """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1260
1261
2b9faf55
PH
def remove_end(s, end):
    """ Return s without the suffix end, or s unchanged if absent. """
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1266
1267
def url_basename(url):
    """ Return the last component of the path of url.

    Leading and trailing slashes of the path are ignored; the result is
    '' if the path has no components. """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1271
1272
class HEADRequest(compat_urllib_request.Request):
    """ A request whose HTTP method is always HEAD. """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1276
1277
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, or return default for None and ''.

    If get_attr is given, that attribute of v is converted instead (a
    missing attribute counts as None). The result is multiplied by
    invscale and floor-divided by scale. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1285
9572013d 1286
40a90862
JMF
def str_or_none(v, default=None):
    """ Convert v to a compat_str, or return default if v is None. """
    if v is None:
        return default
    return compat_str(v)
1289
9732d77e
PH
1290
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before the conversion.
    None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1297
1298
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, or return default if v is None.

    The result is multiplied by invscale and divided by scale. """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1301
1302
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3h11m53s' or '87 Min.'.

    Returns the number of seconds (an int, or a float when a fraction
    is involved), or None if s is not a string or cannot be parsed. """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if m is None:
        return None

    # A bare amount of minutes or hours may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components and the number of seconds each unit stands for
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            res += int(value) * factor
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1347
1348
def prepend_extension(filename, ext, expected_real_ext=None):
    """ Insert ext in front of the extension of filename.

    If expected_real_ext is given and the actual extension differs, ext
    is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
1355
1356
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """ Replace the extension of filename by ext.

    If expected_real_ext is given and the actual extension differs, ext
    is appended to the whole filename instead. """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1362
1363
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary cannot be started. """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
1372
1373
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The combined stdout/stderr output of running exe with args is
    handed to detect_exe_version. """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1387
1388
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from the output of an executable.

    Returns the first group of version_re (by default the word after
    'version'), or unrecognized if the expression does not match. """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if not m:
        return unrecognized
    return m.group(1)
1398
1399
class PagedList(object):
    """ Base class of paged lists; subclasses provide getslice(). """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1404
9c44d242
PH
1405
class OnDemandPagedList(PagedList):
    """ Paged list that requests pages one by one, only as far as needed.

    pagefunc(pagenum) returns an iterable with the entries of that page;
    pagesize is the number of entries on a full page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index start (inclusive) to end
        (exclusive) as a list; end=None means up to the last entry. """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1447
1448
9c44d242
PH
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front.

    pagefunc(pagenum) returns an iterable with the entries of that page;
    pagecount is the number of pages and pagesize the entries per page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index start (inclusive) to end
        (exclusive) as a list; end=None means up to the last page. """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Entries to drop from the front of the first page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1476
1477
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape in s by the character it names. """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(m):
        return decode_escape(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1484
1485
def lowercase_escape(s):
    """ Replace every \\uXXXX escape in s by the character it names. """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(m):
        return decode_escape(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1492
d05cfe06
S
1493
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1499
1500
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; the others are kept as parsed
    escaped_parts = dict(
        (part, escape_rfc3986(getattr(url_parsed, part)))
        for part in ('path', 'params', 'query', 'fragment'))
    return url_parsed._replace(**escaped_parts).geturl()
1510
# struct_pack / struct_unpack: struct.pack / struct.unpack that also
# accept a text format string where the struct module itself does not
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _struct_spec(spec):
        if isinstance(spec, compat_str):
            return spec.encode('ascii')
        return spec

    def struct_pack(spec, *args):
        return struct.pack(_struct_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_struct_spec(spec), *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1527
1528
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) with
    a close() method. A leading byte order mark and surrounding
    whitespace are stripped; empty lines and lines starting with '#',
    ';' or ']' are skipped. Returns the list of remaining URLs. """
    # A decoded UTF-8 BOM is U+FEFF; the three-character form is what the
    # same bytes look like after having been decoded as Latin-1
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1543
1544
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode() and return ASCII bytes. """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1547
1548
0990305d
PH
# etree_iter(node): iterate over the elements of an element tree
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1553
1554
bcf89ce6
PH
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element.

    Document type declarations are ignored. On Python 2, byte string
    element texts are decoded to text. """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1570
1571
a1a530b0
PH
# Age limit for each known US rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """ Parse an age limit such as '18', '18+' or a US rating like 'PG-13'.

    Returns the age as an int, or None if s is None or not recognized. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1586
1587
def strip_jsonp(code):
    """ Strip the callback wrapper from a JSONP response.

    'callback(...);' (optionally followed by // comments) is reduced to
    the text between the parentheses; other input is returned as is. """
    jsonp_re = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_re.sub(r'\1', code)
478c2c61
PH
1591
1592
e05f6939
PH
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, and a comma directly before a closing bracket
    or brace is removed. """
    # Rewrites needed when a single-quoted string becomes double-quoted
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        # JSON literals and double-quoted strings are already valid
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"", lambda m: requote[m.group(0)], v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1616
1617
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in
    quality_ids, or to -1 if the id is unknown. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1626
acd69589
PH
1627
1628DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1629
a020a0dc
PH
1630
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including
    the ellipses, is length characters long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1639
1640
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints.

    Raises ValueError if a component is not an integer. """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1643
1644
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    If version is empty or cannot be compared, it counts as outdated
    only when assume_new is false. """
    if_unknown = not assume_new
    if not version:
        return if_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return if_unknown
732ea2f0
PH
1652
1653
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True if this module was loaded by a zipimporter, or if the
    # interpreter has a sys.frozen attribute
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
7d4111ed
PH
1659
1660
def args_to_str(args):
    """ Return the arguments shell-quoted and joined with spaces. """
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1664
1665
c460bdd5
PH
def mimetype2ext(mt):
    """ Return the file extension for a MIME type.

    This is the subtype (the part after the last '/'), except for a few
    subtypes that are mapped to a different extension. """
    SUBTYPE_EXTENSIONS = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    subtype = mt.rpartition('/')[2]
    return SUBTYPE_EXTENSIONS.get(subtype, subtype)
1674
1675
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """ Detect the file extension from the headers of a response.

    The filename of an attachment Content-Disposition is preferred;
    otherwise the extension is derived from the Content-Type. """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1692
1693
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1702
1703
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    The bytes are decoded according to their byte order mark (UTF-8 if
    there is none). Returns a match object if the text starts with '<'
    after optional whitespace, and None otherwise. """

    # The UTF-32 marks come first: the UTF-32-LE mark starts with the
    # UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], enc
            break
    s = payload.decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
a055469f
PH
1722
1723
def determine_protocol(info_dict):
    """ Return the download protocol for the given info dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is
    derived from the start of the URL, from its extension (m3u8, f4m)
    or, failing that, from its scheme. """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1744
1745
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column except the last is left-aligned and padded to the
    width of its longest value plus one; rows are joined by newlines. """
    table = [header_row] + data
    max_lens = [
        max(len(compat_str(v)) for v in col)
        for col in zip(*table)]
    padded_columns = [
        '%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
1752
1753
def _match_one(filter_part, dct):
    """ Evaluate one part of a filter string against the dictionary dct.

    A part is either a comparison ('key OP value', where a '?' after
    the operator also lets entries without that key pass) or a presence
    test ('key' / '!key'). Raises ValueError for parts that cannot be
    parsed. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op_name = m.group('op')
        op = COMPARISON_OPERATORS[op_name]
        str_operand = m.group('strval')
        if str_operand is not None:
            if op_name not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_name)
            comparison_value = str_operand
        else:
            int_operand = m.group('intval')
            try:
                comparison_value = int(int_operand)
            except ValueError:
                # Not a plain integer: try a file size, with and without
                # an explicit trailing 'B'
                comparison_value = parse_filesize(int_operand)
                if comparison_value is None:
                    comparison_value = parse_filesize(int_operand + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            int_operand, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        check = UNARY_OPERATORS[m.group('op')]
        return check(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1811
1812
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    The filter consists of parts joined by '&', all of which have to
    match. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1818
1819
def match_filter_func(filter_str):
    """ Build a match filter function from the filter string.

    The returned function takes an info dict and returns None if it
    passes the filter, or a message saying why it is skipped. """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
1828
1829
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """ Parse a DFXP time expression into seconds (a float).

    Accepts plain seconds with an optional 's' ('12.5s') and clock
    times ('HH:MM:SS' with optional fraction). An empty expression
    gives 0.0, an unsupported one None. """
    if not time_expr:
        return 0.0

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)

    return None
1841
1842
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'. """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millisecs = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millisecs)
bf6427d2
YCH
1845
1846
def dfxp2srt(dfxp_data):
    """ Convert DFXP/TTML subtitles (as text) into SRT subtitles.

    Raises ValueError if the document contains no paragraphs. """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    # Tags of line breaks and spans, with either namespace or none
    br_tags = (_x('ttml:br'), _x('ttaf1:br'), 'br')
    span_tags = (_x('ttml:span'), _x('ttaf1:span'), 'span')

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in br_tags:
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in span_tags:
                out += str_or_empty(parse_node(child))
            else:
                # Any other element is kept as serialized XML
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    out = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No usable end: fall back to begin + duration
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1887
1888
39672624
YCH
class ISO639Utils(object):
    """Translate between two-letter and three-letter language codes."""

    # Keys are ISO 639-1 codes, values the matching ISO 639-2/T codes.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Return the ISO 639-2/T code for an ISO 639-1 code.

        Only the first two characters of ``code`` are used for the lookup.
        Returns None when those two characters are not a key of the table.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Return the ISO 639-1 code for an ISO 639-2/T code.

        The table is scanned for an entry whose value equals ``code``;
        None is returned when there is no such entry.
        """
        candidates = (
            two_letter
            for two_letter, three_letter in cls._lang_map.items()
            if three_letter == code)
        return next(candidates, None)
2089
2090
4eb10f66
YCH
class ISO3166Utils(object):
    """Look up English country names by two-letter country code."""

    # Keys are upper-case ISO 3166-1 alpha-2 codes.
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name.

        ``code`` is upper-cased before the lookup, so the match is
        case-insensitive. Returns None when the code is not in the table.
        """
        return cls._country_map.get(code.upper())
2349
2350
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy.

    A request carrying a 'Ytdl-request-proxy' header is sent through the
    proxy named by that header; the header is removed before the request is
    passed on. The value '__noproxy__' means "do not use a proxy".
    """

    def __init__(self, proxies=None):
        # Set default handlers before delegating to the base initializer
        for scheme in ('http', 'https'):
            # Defaults bind the current scheme and the bound method now,
            # avoiding the late-binding closure pitfall.
            fallback_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, fallback_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open ``req`` via ``proxy``, honouring a per-request override.

        Returns None without touching the base class when the effective
        proxy is '__noproxy__'; otherwise returns whatever the base class
        ``proxy_open`` returns.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The header is internal bookkeeping: strip it from the request.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy
        base_proxy_open = compat_urllib_request.ProxyHandler.proxy_open
        return base_proxy_open(self, req, proxy, type)