]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
use mimetype2ext to determine manifest ext in multiple extractors
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
702ccf2d 46 compat_shlex_quote,
be4a824d 47 compat_socket_create_connection,
8c25f81b 48 compat_str,
edaa23f8 49 compat_struct_pack,
8c25f81b
PH
50 compat_urllib_error,
51 compat_urllib_parse,
15707c7e 52 compat_urllib_parse_urlencode,
8c25f81b 53 compat_urllib_parse_urlparse,
7581bfc9 54 compat_urllib_parse_unquote_plus,
8c25f81b
PH
55 compat_urllib_request,
56 compat_urlparse,
810c10ba 57 compat_xpath,
8c25f81b 58)
4644ac55 59
71aff188
YCH
60from .socks import (
61 ProxyType,
62 sockssocket,
63)
64
4644ac55 65
51fb4995
YCH
def register_socks_protocols():
    """Teach the URL parser that SOCKS proxy schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    missing = [
        proto for proto in ('socks', 'socks4', 'socks4a', 'socks5')
        if proto not in netloc_schemes]
    netloc_schemes.extend(missing)
73
74
468e2e92
FV
# The re module exposes no public name for the compiled pattern class,
# so derive it from an actual compiled pattern
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request that does not set them itself
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel for "no default supplied" where None is itself a valid default
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

KNOWN_EXTENSIONS = (
    # MPEG-4 family
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    # Flash
    'flv', 'f4v', 'f4a', 'f4b',
    # WebM / Ogg
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    # Matroska
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    # Manifests and playlists
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode:
# maps each accented character to its ASCII replacement (one or two letters)
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 112
46f59e89
S
# strptime() patterns that are unambiguous regardless of day/month ordering.
# Ordinal day suffixes need one literal pattern each ("1st", "2nd", "3rd", "4th").
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Common patterns plus the numeric ones that read the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Common patterns plus the numeric ones that read the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
155
7105440c 156
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    can actually be used to encode text, and 'UTF-8' otherwise.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 170
f4bfd65f 171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the cleanup itself is
        # best effort so that the original error is the one reported
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        With val left as None, any element carrying the attribute matches.
        Returns the first matching element or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Attribute predicates are not used here; filter candidates by hand
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
240
d7e66d39
JMF
241# On python2.6 the xml.etree.ElementTree.Element methods don't support
242# the namespace parameter
5f6a1245
JW
243
244
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix
    are kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, local_name = parts
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join([expand(step) for step in path.split('/')])
255
d77c3dfd 256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order. When nothing matches: default is returned if one was
    given, otherwise ExtractorError is raised if fatal is set, otherwise
    None is returned. name replaces the xpath in the error message.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable of expressions
        # takes the regular no-match path instead of leaving n unbound
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element under node matching xpath.

    A missing element is handled by xpath_element. An element without text
    yields default if given, raises ExtractorError if fatal is set, and
    yields None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % label)
    return None
a41fb80c
S
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if given, otherwise
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
bf0ff932
PH
306
307
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id, or None"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
43e8fafd 311
12ea2f30 312
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose attribute equals value.

    Returns None when the passed HTML document contains no such tag. HTML
    entities in the returned content are unescaped.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content starting with a quote character loses its first and last character
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 334
c5229f39 335
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, attrs holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """

    def __init__(self):
        # Set before the base initialiser runs so the attribute always exists
        self.attrs = dict()
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; the tag name is not recorded
        self.attrs = dict(attrs)
344
c5229f39 345
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string such as
        <el
            a="foo" B="bar" c="&98;az" d=boz
            empty= noval entity="&amp;"
            sq='"' dq="'"
        >
    the result is
        {
            'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
            'empty': '', 'noval': None, 'entity': '&',
            'sq': '"', 'dq': "'"
        }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
9e6dd238 366
c5229f39 367
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Source newlines carry no meaning in HTML; <br> and paragraph
    # boundaries become the real line breaks, then all other tags go
    text = html.replace('\n', ' ')
    for pattern, replacement in (
            (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
            (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
            ('<.*?>', '')):
        text = re.sub(pattern, replacement, text)

    # Replace html entities and trim surrounding whitespace
    return unescapeHTML(text).strip()
9e6dd238
FV
383
384
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    The name '-' selects standard output (its binary buffer when there is
    one). If opening fails for a reason other than a permission error, the
    name is passed through sanitize_path() and, when that changes it,
    opened again; otherwise the original error propagates like it would
    from the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # A permission problem will not be cured by renaming
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
415
416
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 424
5f6a1245 425
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def clean_char(ch):
        if restricted and ch in ACCENT_CHARS:
            return ACCENT_CHARS[ch]
        codepoint = ord(ch)
        # Question marks and control characters are dropped outright
        if ch == '?' or codepoint < 32 or codepoint == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace() or codepoint > 127):
            return '_'
        return ch

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join([clean_char(ch) for ch in s])
    if is_id:
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    cleaned = re.sub(r'_{2,}', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[1:]
    cleaned = cleaned.lstrip('.')
    return cleaned or '_'
d77c3dfd 464
5f6a1245 465
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform the path is returned unchanged. On Windows the
    path is normalized and, in each component except '.' and '..', every
    character Windows forbids in names (as well as a trailing dot or
    whitespace character) is replaced with '#'. A drive or UNC prefix is
    preserved as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    # Before Python 2.7 splitdrive() does not recognise UNC prefixes
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component produced by the separator after the prefix
        norm_path.pop(0)
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequence '\s' being passed through, which newer Pythons warn about
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
482
483
67dda517
S
# Protocol-relative URLs ("//host/path") get an explicit `http:` scheme in
# order to mitigate the number of unwanted failures due to missing protocol
def sanitize_url(url):
    """Return url, prefixed with 'http:' when it starts with '//'."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
488
489
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
67dda517
S
492
493
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original position. Elements are compared by equality, so unhashable
    elements are accepted.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 501
912b38b4 502
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without the leading '&' but with
    the trailing ';' (e.g. 'amp;' or '#x41;'). Entities that cannot be
    resolved are returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() needs the '0x' prefix form for base 16 input here
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; numbers too large for
        # the underlying C integer raise OverflowError instead
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
532
533
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for.

    None is passed through unchanged; any other input must be a text string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        # Group 1 is the entity body including its terminating semicolon
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', decode_entity, s)
d77c3dfd 541
8bf48f23 542
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for names passed to or read from subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
553
554
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    keep_as_text = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Text strings are passed through to use Unicode APIs on Windows 2000
        # and up. (Detecting Windows NT 4 is tricky because 'major >= 4'
        # would match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        sys.platform.startswith('java'))
    if keep_as_text:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
577
578
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string filename to text on Python 2.

    On Python 3, and for anything that is not a byte string, the value is
    returned unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 588
f07b74fc
PH
589
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # TODO: after fixing all post processors, turn this case into
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
597
598
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
601
602
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding byte strings with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 611
5f6a1245 612
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: an hours field from one hour up,
    a minutes field from one minute up, and bare seconds below that.
    """
    # Boundaries are inclusive so that exactly one hour renders as
    # '1:00:00' (not '60:00') and exactly one minute as '1:00' (not '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
620
a0ddb8a2 621
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler using the best SSL setup available.

    Certificate verification is switched off when params contains a true
    'nocheckcertificate' entry.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=ctx, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    ctx.verify_mode = ssl.CERT_NONE if skip_verification else ssl.CERT_REQUIRED
    ctx.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=ctx, **kwargs)
ea6d901e 645
732ea2f0 646
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages asking for a bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
656
657
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prepended to the message; cause, if given, is appended to it.
        """

        # Errors raised while handling a network failure or an unavailable
        # video are always treated as expected
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 685
1c256f70 686
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that cannot be handled."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
692
693
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
697
698
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
711
712
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
720
721
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class as well, so that str(exc) and exc.args
        # carry the message instead of being empty
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 731
5f6a1245 732
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
736
737
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
745
746
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the base class a message so that str(exc) is informative
        # rather than empty
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 759
5f6a1245 760
c5a59d93 761def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
762 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
763 # expected HTTP responses to meet HTTP/1.0 or later (see also
764 # https://github.com/rg3/youtube-dl/issues/6727)
765 if sys.version_info < (3, 0):
5a1a2e94 766 kwargs[b'strict'] = True
be4a824d
PH
767 hc = http_class(*args, **kwargs)
768 source_address = ydl_handler._params.get('source_address')
769 if source_address is not None:
770 sa = (source_address, 0)
771 if hasattr(hc, 'source_address'): # Python 2.7+
772 hc.source_address = sa
773 else: # Python 2.6
774 def _hc_connect(self, *args, **kwargs):
775 sock = compat_socket_create_connection(
776 (self.host, self.port), self.timeout, sa)
777 if is_https:
d7932313
PH
778 self.sock = ssl.wrap_socket(
779 sock, self.key_file, self.cert_file,
780 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
781 else:
782 self.sock = sock
783 hc.connect = functools.partial(_hc_connect, hc)
784
785 return hc
786
787
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a new dict is returned without the marker
    and without any Accept-Encoding header (matched case-insensitively).
    Otherwise headers itself is returned. The input is never modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    filtered_headers = {}
    for header, value in headers.items():
        if header.lower() != 'accept-encoding':
            filtered_headers[header] = value
    del filtered_headers[marker]
    return filtered_headers
87f0e62d
YCH
796
797
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Installed with an OpenerDirector, this class adds the standard headers
    to every HTTP request and transparently decodes gzipped and deflated
    responses from web servers. To avoid compression for a particular
    request, the calling code only has to include the HTTP header
    "Youtubedl-no-compression" in it; the header is removed before the
    real request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # The SOCKS proxy travels as an internal header that must not
        # reach the server
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then fall back to a
        # zlib-wrapped one
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # No status code parameter available: set the attribute afterwards
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp

        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']

        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 927
5de90176 928
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection class.
    socks_proxy is the proxy URL; its scheme selects the protocol
    (socks5, socks/socks4 or socks4a), the port defaults to 1080 and any
    username/password are URL-unquoted before use.

    Raises ValueError when the URL scheme is not a supported SOCKS variant.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously socks_type was simply left unbound here, which surfaced
        # as an opaque UnboundLocalError a few lines further down.
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %s' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only forward real numeric timeouts (the default is a sentinel object)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
970
971
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, routing it through a SOCKS proxy when requested.

        The proxy is requested via the internal 'Ytdl-socks-proxy' header,
        which is removed from the request before it is sent.
        """
        connection_class = self._https_conn_class
        open_kwargs = {}

        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
            if hasattr(self, '_check_hostname'):  # python 3.x
                open_kwargs['check_hostname'] = self._check_hostname

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
995
996
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles https requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently commented out, so no
        # escaping actually takes place here.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Use the same handlers for https as for http
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1019
1020
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing 'Z' or numeric UTC offset off date_str.

    Returns a (timezone, date_str) tuple where timezone is a
    datetime.timedelta (zero when no offset is present) and date_str is
    the input with the matched suffix removed.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    stripped = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), stripped

    direction = -1 if sign_char != '+' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, stripped
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date and time parts. When timezone (a
    datetime.timedelta) is not given, it is extracted from date_str.
    Returns None for None input or when the string cannot be parsed.
    """
    if date_str is None:
        return None

    # Fractional seconds are not supported by the format below: drop them
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        return None
912b38b4
PH
1056
1057
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            try:
                result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1088
5f6a1245 1089
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The formats returned by date_formats(day_first) are tried first, then
    RFC 2822 parsing via email.utils.parsedate_tz. Returns None when
    date_str is None or nothing matches.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # parsedate_tz already returns a plain tuple (it has no .timetuple()
        # method); calendar.timegm only reads its first six fields.
        return calendar.timegm(timetuple)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, or return default_ext."""
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1124
5f6a1245 1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 1128
5f6a1245 1129
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    one of the words now, today or yesterday, or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?

    Raises ValueError when date_str matches none of these forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta has no month/year, so both are
        # converted to a fixed number of days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return '-'.join((year, month, day))
1167
5f6a1245 1168
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start is later than end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode byte strings with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1208
1209
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors (1 = stdout, 2 = stderr) to the values passed
    # to GetStdHandle (STD_OUTPUT_HANDLE / STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is a console only if it is a character device for which
        # GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is forced, _windows_write_string is tried
    first. Otherwise s is encoded (unencodable characters are ignored) for
    binary streams and streams exposing a .buffer, or written as text.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    is_binary = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if is_binary or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
1305
1306
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing yields one-character strings
        return [ord(ch) for ch in bs]
    # Python 3: indexing already yields integers
    return list(bs)
1314
c257baff 1315
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        layout = '%dB' % len(xs)
        return compat_struct_pack(layout, *xs)
    return b''
c38b1e77
PH
1320
1321
c1c9a79c
PH
1322# Cross-platform file locking
if sys.platform == 'win32':
    # Windows: lock through LockFileEx/UnlockFileEx from kernel32
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f from offset 0; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock, 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1395
1396
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context manager.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1426
1427
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1431
1432
def shell_quote(args):
    """Return args joined into a single shell-quoted command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote instead of pipes.quote: the pipes module is
        # deprecated and no longer exists in Python 3.13
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. Data already smuggled in
    url is kept and takes precedence over the new values. The caller's
    data dict is left untouched.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is not mutated
    payload = dict(data)
    payload.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(payload)})
    return url + '#' + sdata
9d4660ca
PH
1452
1453
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1461
1462
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    Returns 'N/A' for None. str input is converted with float() first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: very large values would otherwise raise
        # IndexError and tiny fractions would pick a suffix from the end
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1475
1c088fa8 1476
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s and scale it by unit_table.

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value as an int, or None when s does not match.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1486
1487
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None input or when the string is not recognised.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    return lookup_unit_table(unit_table, s)
1532
1533
def parse_count(s):
    """Parse a count such as '1,000' or '1.2M' into an int.

    Returns None for None input or when the string is not recognised.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    unit_table = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(unit_table, text)
be64b5b0 1553
2f7ae819 1554
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        position = ENGLISH_MONTH_NAMES.index(name)
    except ValueError:
        return None
    return position + 1
1562
1563
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        # Abbreviations are the first three letters of the full name
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1572
1573
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1580
1581
def setproctitle(title):
    """Best-effort attempt to set the process name via prctl() from libc.so.6.

    Silently does nothing on Jython, when libc.so.6 cannot be loaded or
    when the loaded library has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so it is always NUL-terminated as prctl expects. Sizing it
    # to len(title_bytes) left no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1601
1602
def remove_start(s, start):
    """Return s without the prefix start; s is returned as is otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1605
1606
def remove_end(s, end):
    """Return s without the suffix end; s is returned as is otherwise.

    None is passed through unchanged.
    """
    if s is not None and s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0] (an
        # empty string) for an empty suffix
        return s[:len(s) - len(end)]
    return s
2b9faf55
PH
1609
1610
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1618
1619
def url_basename(url):
    """Return the last component of the path of url."""
    url_path = compat_urlparse.urlparse(url).path
    return url_path.strip('/').rpartition('/')[2]
aa94a6d3
PH
1623
1624
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1628
1629
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1633
1634
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    When get_attr is given, the attribute of that name is read from v
    first. The result is int(v) * invscale // scale. None, the empty
    string and values int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: int() was given an unsupported type such as a list or dict
        return default
9732d77e 1647
9572013d 1648
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1651
9732d77e
PH
1652
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1659
1660
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: float() was given an unsupported type such as a list or dict
        return default
43f775e4
PH
1668
1669
b72b4431
S
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1672
1673
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: clock style ('1:02:03.5'), unit
    style ('1h 2m 3s', also with a leading 'PT') and a single decimal
    amount of hours or minutes ('1.5 hours'). Returns None when s is not a
    string or matches none of them.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        worded = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.groups()
        else:
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single:
                return None
            hours, mins = single.groups()

    # Sum up whichever components were captured
    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        total += float(ms)
    return total
91d7d0b3
JMF
1720
1721
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1728
1729
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext.

    When expected_real_ext is given and does not match the real extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    keep_whole_name = expected_real_ext and real_ext[1:] != expected_real_ext
    base = filename if keep_whole_name else stem
    return '{0}.{1}'.format(base, ext)
1735
1736
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1745
1746
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1760
1761
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must contain one capturing group. Returns unrecognized when
    the pattern is not found in output.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1771
1772
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1777
9c44d242
PH
1778
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time until a short page is seen."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        # pagefunc(pagenum) returns an iterable with the entries of one page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            # Maps page number to the list of entries fetched for it
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry within this page, or
            # None when the slice extends beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1829
1830
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1858
1859
def uppercase_escape(s):
    """Expand every literal \\UXXXXXXXX escape in s to its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1866
1867
def lowercase_escape(s):
    """Expand every literal \\uXXXX escape in s to its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1874
d05cfe06
S
1875
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text strings are encoded to UTF-8 bytes before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1881
1882
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; the other components are percent-escaped
    netloc = parts.netloc.encode('idna').decode('ascii')
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        netloc=netloc, path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1893
62e609ab
PH
1894
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object and close it.

    Each line is decoded as UTF-8 when it is not text already, stripped of a
    leading byte order mark and of surrounding whitespace. Empty lines and
    lines starting with '#', ';' or ']' are dropped.

    Returns the remaining lines as a list of strings.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM becomes the single character U+FEFF once the line has
        # been decoded as UTF-8. The legacy three-character form (the BOM
        # bytes decoded one character per byte) is still accepted as well.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1909
1910
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1913
1914
def update_url_query(url, query):
    """Return url with the parameters of query merged into its query string.

    Parameters already present in url are overwritten by the ones in query.
    A false query (None, empty dict) returns url unchanged.
    """
    if not query:
        return url
    components = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(components.query)
    merged.update(query)
    # doseq=True: list values are emitted as repeated parameters
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(components._replace(query=new_query))
16392824 1923
8e60dc75 1924
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Create a new request based on req with some parts replaced.

    url and data replace the original values when true, headers are merged
    over the original headers and query is merged into the URL's query
    string. The HTTP method (HEAD, PUT or default) and, when present, the
    timeout attribute of req are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Keep the request class matching the original HTTP method
    method = req.get_method()
    request_cls = compat_urllib_request.Request
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest

    new_req = request_cls(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1943
1944
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple of keys, in d.

    With several keys, a key is skipped when it is missing or maps to None,
    and also when it maps to a false value while skip_false_values is set.
    default is returned when no key yields a value.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1953
1954
329ca3be
S
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails or has the wrong type.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed. When expected_type is given, the value is only returned if it
    is an instance of that type.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
1963
1964
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding unless it already is text.

    Note that the default encoding is computed once, when the function is
    defined.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1967
16392824 1968
a1a530b0
PH
# Numeric age limit that parse_age_limit() returns for a US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1976
1977
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+', or a US rating label.

    Returns the age as an int, or None when s is None or not recognised.
    """
    if s is None:
        return None
    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))
    # Not a plain number: fall back to the US rating table
    return US_RATINGS.get(s)
146c80e2
S
1983
1984
def strip_jsonp(code):
    """Strip a JSONP wrapper, returning only what is between the callback's parentheses.

    Input that does not look like "callback(...)" is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1988
1989
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    The following constructs are rewritten: single-quoted strings, unquoted
    identifiers and numeric keys (they become double-quoted strings), bare
    hexadecimal and octal integer literals (they become decimal), block
    comments and commas directly before a closing bracket or brace (both are
    removed). Everything else is passed through unchanged.
    """
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )
    # Rewrites applied to escape sequences and quotes inside a string literal
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(
                r'(?s)\\.|"',
                lambda sm: STRING_FIXES.get(sm.group(0), sm.group(0)),
                v[1:-1])
        else:
            # Only bare tokens can be integer literals; the content of a
            # quoted string such as "010" or "0x1F" must stay a string.
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(0), base)
                    # A trailing colon means the number was used as a key
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
2027
2028
478c2c61
PH
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in quality_ids (-1 if unknown) """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2037
acd69589
PH
2038
# Default output template: a %-format string with named title, id and ext fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2040
a020a0dc
PH
2041
def limit_length(s, length):
    """ Shorten s to at most length characters, ending it with '...' when cut """
    if s is None or len(s) <= length:
        return s
    suffix = '...'
    return s[:length - len(suffix)] + suffix
48844745
PH
2050
2051
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints.

    Raises ValueError if a part is not an integer.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
2054
2055
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer is
    "not assume_new".
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            # Unparsable version: fall through to the assumption below
            pass
    return not assume_new
732ea2f0
PH
2063
2064
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updating is possible when this module was loaded by a zipimporter or
    # when the interpreter has a 'frozen' attribute set on sys
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
2070
2071
def args_to_str(args):
    """Return a short, shell-quoted string representation of a subprocess command."""
    quoted_args = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted_args)
2ccd1b10
PH
2075
2076
def error_to_compat_str(err):
    """Return the message of err as text."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2084
2085
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters following the type (for example "; charset=utf-8") are
    ignored and the type is matched case-insensitively. Known types are
    mapped to their usual extension; for anything else the subtype itself is
    returned.

    Returns None when mt is None.
    """
    if mt is None:
        return None

    # Drop parameters and fold case so that values taken straight from a
    # Content-Type header are recognised as well.
    mt = mt.split(';', 1)[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    # From here on only the subtype matters
    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
    }.get(res, res)
2119
2120
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The file name of a Content-Disposition attachment is tried first; if it
    gives no extension, the Content-Type header is used.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    filename_match = disposition and re.match(
        r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
    if filename_match:
        ext = determine_ext(filename_match.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2133
2134
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI carrying data (bytes) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2137
2138
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2147
2148
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Longer marks come before the shorter ones they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is treated as UTF-8
    encoding = 'utf-8'
    payload = first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            payload = first_bytes[len(bom):]
            encoding = enc
            break
    text = payload.decode(encoding, 'replace')

    # HTML is assumed when the first non-blank character is '<'
    return re.match(r'^\s*<', text)
a055469f
PH
2167
2168
def determine_protocol(info_dict):
    """Return the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the 'url' entry: its prefix, then its extension, then its scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognised by file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2189
2190
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to its width + 1
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2197
2198
2199def _match_one(filter_part, dct):
2200 COMPARISON_OPERATORS = {
2201 '<': operator.lt,
2202 '<=': operator.le,
2203 '>': operator.gt,
2204 '>=': operator.ge,
2205 '=': operator.eq,
2206 '!=': operator.ne,
2207 }
2208 operator_rex = re.compile(r'''(?x)\s*
2209 (?P<key>[a-z_]+)
2210 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2211 (?:
2212 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2213 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2214 )
2215 \s*$
2216 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2217 m = operator_rex.search(filter_part)
2218 if m:
2219 op = COMPARISON_OPERATORS[m.group('op')]
2220 if m.group('strval') is not None:
2221 if m.group('op') not in ('=', '!='):
2222 raise ValueError(
2223 'Operator %s does not support string values!' % m.group('op'))
2224 comparison_value = m.group('strval')
2225 else:
2226 try:
2227 comparison_value = int(m.group('intval'))
2228 except ValueError:
2229 comparison_value = parse_filesize(m.group('intval'))
2230 if comparison_value is None:
2231 comparison_value = parse_filesize(m.group('intval') + 'B')
2232 if comparison_value is None:
2233 raise ValueError(
2234 'Invalid integer value %r in filter part %r' % (
2235 m.group('intval'), filter_part))
2236 actual_value = dct.get(m.group('key'))
2237 if actual_value is None:
2238 return m.group('none_inclusive')
2239 return op(actual_value, comparison_value)
2240
2241 UNARY_OPERATORS = {
2242 '': lambda v: v is not None,
2243 '!': lambda v: v is None,
2244 }
2245 operator_rex = re.compile(r'''(?x)\s*
2246 (?P<op>%s)\s*(?P<key>[a-z_]+)
2247 \s*$
2248 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2249 m = operator_rex.search(filter_part)
2250 if m:
2251 op = UNARY_OPERATORS[m.group('op')]
2252 actual_value = dct.get(m.group('key'))
2253 return op(actual_value)
2254
2255 raise ValueError('Invalid filter part %r' % filter_part)
2256
2257
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&' and all of them have to match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2263
2264
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and a
    message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2273
2274
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds.

    Accepted forms are an offset in seconds ("12.5" or "12.5s") and a clock
    time ("HH:MM:SS", optionally followed by ".fraction" or ":fraction").
    Returns a float, or None for empty or unrecognised input.
    """
    if not time_expr:
        return

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
2286
2287
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode, HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2290
2291
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitles (text) to SRT (text).

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are skipped. Raises ValueError if no paragraph is found.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of a paragraph; line break
        # elements become newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the node again and feed it through the text collector
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then paragraphs without namespace
    paras = []
    for path in (_x('.//ttml:p'), _x('.//ttaf1:p'), _x('.//ttaf1_0604:p'), './/p'):
        paras = dfxp.findall(path)
        if paras:
            break

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2345
2346
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if it is unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2350
2351
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as command line arguments.

    Without separator the result is [command_option, value]; with one it is
    the single argument command_option + separator + value.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2358
2359
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2363
2364
def cli_configuration_args(params, param, default=[]):
    """Return the list stored in params[param], or default if it is unset.

    NOTE: default is returned as is, so the shared default list must not be
    modified by callers.
    """
    extra_args = params.get(param)
    if extra_args is not None:
        assert isinstance(extra_args, list)
        return extra_args
    return default
2371
2372
39672624
YCH
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked at
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        # None when the three-letter code is unknown
        return next(matches, None)
2573
2574
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (any case) to the full country name"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2833
2834
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use for it; the special value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            # The scheme and the bound method are frozen as default
            # arguments so that each handler keeps its own values
            handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request headers takes precedence; the header
        # is removed so that it is not sent along
        header_proxy = req.headers.get('Ytdl-request-proxy')
        if header_proxy is not None:
            proxy = header_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2858
2859
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are taken in reverse order and read as one big integer
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
81bdc8fd
YCH
2875
2876
def encode_base_n(num, n, table=None):
    """Return the representation of the integer num in base n.

    table supplies the digit characters; by default the first n characters
    of 0-9, a-z, A-Z are used. Raises ValueError if the table has fewer
    than n characters.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Digits are produced least significant first
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
2893
2894
def decode_packed_codes(code):
    """Decode JavaScript packed as "}('payload',base,count,'a|b|c'.split('|')".

    Every word of the payload is the base-n number of a symbol and is
    replaced by that symbol, or kept as is when the symbol is empty.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    payload, base, count, symbols = packed.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    # Walk the symbol indices from the highest down to zero
    for index in range(count - 1, -1, -1):
        key = encode_base_n(index, base)
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        payload)
e154c651 2913
2914
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into a dict.

    Surrounding double quotes are removed from quoted values.
    """
    attribute_re = re.compile(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    info = {}
    for mobj in attribute_re.finditer(attrib):
        val = mobj.group('val')
        info[mobj.group('key')] = val[1:-1] if val.startswith('"') else val
    return info
1143535d
YCH
2922
2923
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as an unsigned 32-bit number."""
    if val < 0:
        val += 0x100000000
    return val >> n