]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[utils] Add unified_timestamp
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
702ccf2d 46 compat_shlex_quote,
be4a824d 47 compat_socket_create_connection,
8c25f81b 48 compat_str,
edaa23f8 49 compat_struct_pack,
8c25f81b
PH
50 compat_urllib_error,
51 compat_urllib_parse,
15707c7e 52 compat_urllib_parse_urlencode,
8c25f81b 53 compat_urllib_parse_urlparse,
7581bfc9 54 compat_urllib_parse_unquote_plus,
8c25f81b
PH
55 compat_urllib_request,
56 compat_urlparse,
810c10ba 57 compat_xpath,
8c25f81b 58)
4644ac55 59
71aff188
YCH
60from .socks import (
61 ProxyType,
62 sockssocket,
63)
64
4644ac55 65
51fb4995
YCH
def register_socks_protocols():
    """Make the URL parser treat the SOCKS proxy schemes as netloc-bearing.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme in netloc_schemes:
            continue
        netloc_schemes.append(scheme)
73
74
468e2e92
FV
# This is not clearly defined otherwise
# (the type object is obtained by compiling an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers, keyed by header name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 85
5f6a1245 86
bf42a990
S
# Unique sentinel object meaning "no default value was supplied"; it lets
# None itself be passed as a legitimate default.
NO_DEFAULT = object()

# English month names in calendar order (January first).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions (without the leading dot).
# NOTE(review): presumably the set of media/manifest extensions recognised
# elsewhere in the project; the consumers are not visible here.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# chained so that multi-letter ones ('AE', 'OE', 'ss', 'ae', 'oe') count as a
# single item and stay aligned with the character string in zip().
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 112
46f59e89
S
# Date/time patterns (strptime-style directives) that do not depend on
# whether the day or the month is written first.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# The common patterns followed by the ambiguous numeric ones, read day-first.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) + [
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
]

# The common patterns followed by the ambiguous numeric ones, read month-first.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) + [
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]
155
7105440c 156
def preferredencoding():
    """Get preferred encoding.

    Asks locale.getpreferredencoding() for the system encoding and checks
    that it can actually encode text; if either step raises, 'UTF-8' is
    returned instead.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        return 'UTF-8'
d77c3dfd 170
f4bfd65f 171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    if is_py2 and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = os.path.basename(fn).decode(fs_encoding) + '.'
        tmp_dir = os.path.dirname(fn).decode(fs_encoding)
    else:
        tmp_prefix = os.path.basename(fn) + '.'
        tmp_dir = os.path.dirname(fn)

    tmp_args = {
        'suffix': '.tmp',
        'prefix': tmp_prefix,
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_py2:
        tmp_args['mode'] = 'wb'
    else:
        tmp_args['mode'] = 'w'
        tmp_args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before propagating
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath[@key] or xpath[@key='val'] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath that carries attribute key (equal to val, if given) """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
240
d7e66d39
JMF
241# On python2.6 the xml.etree.ElementTree.Element methods don't support
242# the namespace parameter
5f6a1245
JW
243
244
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of a '/'-separated path to '{uri}tag'.

    Steps without a colon are kept unchanged; the URI for a prefix is looked
    up in ns_map.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
255
d77c3dfd 256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matched by xpath.

    xpath is either a single expression (a string) or an iterable of
    expressions that are tried in order until one matches.

    When nothing matches: default is returned if one was supplied;
    otherwise ExtractorError is raised if fatal is set (the message uses
    name, or xpath when name is None); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from None so that an empty sequence of expressions is
        # treated as "not found" instead of raising UnboundLocalError below.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    Whatever xpath_element() yields for a missing element (None or default)
    is passed through. If the element exists but has no text, default is
    returned when supplied, ExtractorError is raised when fatal is set, and
    None is returned otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % label)
a41fb80c
S
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if supplied,
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    match = find_xpath_attr(node, xpath, key)
    if match is not None:
        return match.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
bf0ff932
PH
306
307
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID in the passed HTML document"""
    content = get_element_by_attribute('id', id, html)
    return content
43e8fafd 311
12ea2f30 312
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag carrying attribute=value (value optionally quoted) is
    located with a regular expression; the text between that tag and the
    next closing tag of the same name is returned after HTML-unescaping.
    Returns None when no such tag is found.
    """

    # Verbose + dotall pattern: tag name, any attributes before the wanted
    # one, the wanted attribute, any attributes after it, then the content
    # (non-greedy) up to the matching closing tag name.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Content that starts with a quote character loses its first and last
    # character.
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
a921f407 334
c5229f39 335
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of a start tag.

    After feeding, attrs holds the attributes of the most recently seen
    start tag as a dictionary (empty if none was seen).
    """
    def __init__(self):
        self.attrs = dict()
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs
        self.attrs = dict((attr_name, attr_value) for attr_name, attr_value in attrs)
344
c5229f39 345
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
9e6dd238 366
c5229f39 367
def clean_html(html):
    """Turn an HTML snippet into readable plain text (None is passed through)"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Literal newlines become spaces; <br> and </p><p> boundaries become newlines
    flattened = html.replace('\n', ' ')
    with_breaks = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', flattened)
    with_paragraphs = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', with_breaks)
    # Drop every remaining tag
    tagless = re.sub('<.*?>', '', with_paragraphs)
    # Decode HTML entities and trim surrounding whitespace
    return unescapeHTML(tagless).strip()
9e6dd238
FV
383
384
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    when one exists) instead of opening a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                # Put stdout into binary mode on Windows
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error is re-raised as is; no retry is attempted
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so a retry would fail the same way
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
415
416
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 424
5f6a1245 425
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, a stricter subset of characters is allowed
    (accented letters are transliterated, other non-ASCII characters,
    whitespace and shell-special characters become underscores).
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible; the cosmetic clean-up at the end is then skipped.
    """
    def replace_insane(char):
        code = ord(char)
        if restricted:
            if char in ACCENT_CHARS:
                return ACCENT_CHARS[char]
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join([replace_insane(char) for char in s])
    if is_id:
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    cleaned = re.sub(r'_{2,}', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[len('-'):]
    cleaned = cleaned.lstrip('.')
    return cleaned or '_'
d77c3dfd 464
5f6a1245 465
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform the path is returned unchanged. On Windows the
    path is normalized and, in every component except '.' and '..', each
    character that is forbidden in file names, as well as a trailing
    whitespace character or dot, is replaced with '#'. A drive or UNC
    prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the leading separator
        norm_path.pop(0)
    # Raw string: same pattern as before, but without relying on the invalid
    # '\s' escape in a normal string literal (a warning on modern Python).
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
482
483
67dda517
S
484# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
485# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Prefix a protocol-less URL (one starting with '//') with 'http:'; return any other URL unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
488
489
def sanitized_Request(url, *args, **kwargs):
    """Create a urllib Request for url after passing it through sanitize_url().

    All further positional and keyword arguments are forwarded unchanged.
    """
    fixed_url = sanitize_url(url)
    return compat_urllib_request.Request(fixed_url, *args, **kwargs)
67dda517
S
492
493
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 501
912b38b4 502
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity (given without '&' but with ';') to a character.

    Entities that cannot be resolved are returned in their literal
    '&name;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    named = compat_html_entities.name2codepoint
    if entity in named:
        return compat_chr(named[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        if digits.startswith('x'):
            # Turn 'x1F' into '0x1F' so int() accepts it with base 16
            radix, digits = 16, '0%s' % digits
        else:
            radix = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, radix))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
532
533
def unescapeHTML(s):
    """Replace every '&...;' HTML entity in s with its character (None is passed through)."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', _replace, s)
d77c3dfd 541
8bf48f23 542
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for arguments of subprocess calls.

    On Windows (major version >= 5) this is the locale encoding, see
    http://stackoverflow.com/a/9951851/35070; elsewhere it is the file
    system encoding. 'utf-8' is used when no encoding is reported.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    encoding = preferredencoding() if on_modern_windows else sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
553
554
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with the file system (or, with
    for_subprocess set, as a subprocess argument).

    @param s The name of the file
    """

    assert type(s) == compat_str

    keep_unicode = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        sys.platform.startswith('java'))
    if keep_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
577
578
def decodeFilename(b, for_subprocess=False):
    """Decode a bytes file name with the subprocess encoding.

    On Python 3, and for anything that is not a bytes object, the argument
    is returned unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 588
f07b74fc
PH
589
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call.

    Byte strings passed by legacy code are first decoded as ASCII.
    """
    # Legacy code still passes byte strings here; once all post processors
    # are fixed this should become an internal error instead.
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
597
598
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; shorthand for decodeFilename(b, True)."""
    decoded = decodeFilename(b, True)
    return decoded
601
602
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding (None is passed through)."""
    if optval is None:
        return optval

    value = optval
    if isinstance(value, bytes):
        value = value.decode(preferredencoding())

    assert isinstance(value, compat_str)
    return value
1c256f70 611
5f6a1245 612
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The hour field is used from 3600 seconds on and the minute field from
    60 seconds on (inclusive), so exactly one minute is rendered as '1:00'
    and exactly one hour as '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
620
a0ddb8a2 621
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured according to params.

    params is a mapping; its 'nocheckcertificate' entry (default False)
    disables certificate verification. Extra keyword arguments are
    forwarded to YoutubeDLHTTPSHandler. How the SSL context is built
    depends on what the running Python's ssl module offers.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version-based construction below.
            pass

    if sys.version_info < (3, 2):
        # No context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 645
732ea2f0 646
08f2a92c
JMF
def bug_reports_message():
    """Return the text, appended to error messages, that asks the user to report the issue."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    sentences = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(sentences)
656
657
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        The error is also treated as expected when it is created while a URL
        error, socket timeout or UnavailableVideoError is being handled.
        """
        expected_exc_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_exc_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report" text appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        tb = self.traceback
        if tb is None:
            return None
        return ''.join(traceback.format_tb(tb))
01951dda 685
1c256f70 686
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for an unsupported URL; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
692
693
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error signalling that a regular expression did not match"""
697
698
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
711
712
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
720
721
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error description; it is available as .msg and via str(). """
        # Initialise the base class explicitly so that args and str() carry
        # the message however the argument was passed (e.g. msg='...').
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 731
5f6a1245 732
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
d77c3dfd
FV
736
737
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
745
746
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the received and announced sizes, both in bytes. """
        # Give the exception a readable message instead of a bare tuple of
        # the two numbers.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 759
5f6a1245 760
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and apply the handler's 'source_address' option.

    ydl_handler supplies the options through its _params mapping; the
    remaining positional and keyword arguments go to http_class. is_https
    only matters for the Python 2.6 fallback, where it decides whether the
    manually created socket is wrapped with SSL.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 in the (address, port) pair used as the source address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() on this
            # instance with a version that binds to sa itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
786
787
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker the given mapping is returned as is. With it, a new
    dict is returned that lacks both the marker and any Accept-Encoding
    header (matched case-insensitively); the input is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    kept = {}
    for key, value in headers.items():
        if key.lower() != 'accept-encoding':
            kept[key] = value
    del kept['Youtubedl-no-compression']
    return kept
87f0e62d
YCH
796
797
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (the options mapping) and initialise the base handler with the remaining arguments."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, routing through a SOCKS proxy when the request carries a 'Ytdl-socks-proxy' header."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # The header is internal; remove it before the request is sent
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting .code manually when the class takes no code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request: escape its URL and complete its headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add every standard header the request does not already carry
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response: undo gzip/deflate encoding and percent-encode a redirect Location."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, the first error is re-raised.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 927
5de90176 928
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Build a connection class that tunnels through a SOCKS proxy.

    base_class must be an HTTP(S) connection class; socks_proxy is the
    proxy URL (socks, socks4, socks4a or socks5 scheme).  The returned
    subclass overrides connect() to go through a sockssocket.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    parsed = compat_urlparse.urlparse(socks_proxy)
    scheme = parsed.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Leave None / empty credentials untouched
        return compat_urllib_parse_unquote_plus(s) if s else s

    proxy_args = (
        socks_type,
        parsed.hostname,
        parsed.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(parsed.username),
        unquote_if_non_empty(parsed.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if not isinstance(self, compat_http_client.HTTPSConnection):
                return
            if hasattr(self, '_context'):  # Python > 2.6
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)
            else:
                self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
970
971
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, routing through a SOCKS proxy when the request asks for it."""
        open_kwargs = {}
        connection_class = self._https_conn_class

        # '_context' exists on python > 2.6, '_check_hostname' on python 3.x
        for attr, key in (('_context', 'context'),
                          ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr):
                open_kwargs[key] = getattr(self, attr)

        proxy = req.headers.get('Ytdl-socks-proxy')
        if proxy:
            connection_class = make_socks_conn_class(connection_class, proxy)
            del req.headers['Ytdl-socks-proxy']

        factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(factory, req, **open_kwargs)
be4a824d
PH
995
996
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles https responses like http ones."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (The escaping below is currently disabled.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        parent = compat_urllib_request.HTTPCookieProcessor
        return parent.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1019
1020
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC offset ('Z', '+HH:MM', '-HHMM') off date_str.

    Returns (offset as datetime.timedelta, date string without the offset).
    Without a recognizable offset the delta is zero and the string is
    returned unchanged.
    """
    found = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not found:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(found.group('tz'))]
    if not found.group('sign'):
        # A bare 'Z' means UTC
        return datetime.timedelta(), remainder

    direction = 1 if found.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(found.group('hours')),
        minutes=direction * int(found.group('minutes')))
    return offset, remainder
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are discarded.  When timezone is None the offset is
    taken from the string itself.  Returns None for None input or when the
    string does not parse.
    """
    if date_str is None:
        return None

    # Drop fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(date_str, layout) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        return None
912b38b4
PH
1056
1057
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1088
5f6a1245 1089
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    The string is tried against the formats from date_formats(day_first);
    if none match, email.utils.parsedate_tz is used as a fallback.
    Returns None for None input or when nothing parses.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # strptime formats carry no AM/PM marker, so PM is applied as a 12h shift
    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # parsedate_tz returns a plain 10-tuple (not a datetime); its last item
    # is the UTC offset in seconds or None.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        shift = pm_delta - timezone
        return (
            calendar.timegm(timetuple) -
            (timetuple[9] or 0) +
            shift.days * 86400 + shift.seconds)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, or return default_ext."""
    if url is None:
        return default_ext

    # Text after the last dot of the part preceding the query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1124
5f6a1245 1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<lang>.<format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 1128
5f6a1245 1129
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'yesterday' is accepted as well.  Raises ValueError (from strptime)
    when the string matches none of the supported forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other shape are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1167
5f6a1245 1168
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date
        if not isinstance(candidate, datetime.date):
            candidate = date_from_str(candidate)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1208
1209
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The string is written with the Win32 WriteConsoleW call when `out` is
    backed by file descriptor 1 or 2 and that handle is a real console.
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map file descriptors to the GetStdHandle ids
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written as 2 UTF-16 code units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    Streams in binary mode receive encoded bytes; text streams exposing a
    ``buffer`` get bytes written to that buffer; anything else receives the
    text itself.  Unencodable characters are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(codec, 'ignore'))
    else:
        out.write(s)
    out.flush()
1305
1306
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are one-character strings here (Python 2)
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1314
c257baff 1315
def intlist_to_bytes(xs):
    """Pack a sequence of integers (0-255) into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1320
1321
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs raising IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer on the file object; _unlock_file reuses it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1395
1396
class locked_file(object):
    """Context manager around a file that is locked while in use.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1426
1427
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1431
1432
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote replaces the undocumented pipes.quote; the
        # pipes module is gone in recent Python versions
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data travels JSON-encoded in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse_urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1450
1451
def unsmuggle_url(smug_url, default=None):
    """Undo smuggle_url: return (url, data), or (smug_url, default) if nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1459
1460
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human readable string such as '1.50MiB'.

    None yields 'N/A'.  Numeric strings are accepted.  Values beyond the
    largest known unit are expressed in that unit, and values below one
    byte (or negative ones) never select a wrong unit.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # abs() keeps negative values out of math.log's invalid domain
        exponent = int(math.log(abs(bytes), 1024.0))
        # Clamp so that neither tiny fractions (negative exponent) nor huge
        # values index outside the suffix table
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1473
1c088fa8 1474
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of s using a unit -> multiplier map.

    A comma is accepted as decimal separator.  Returns the scaled value as
    an int, or None when s does not start with a number and a known unit.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if not found:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1484
1485
be64b5b0
PH
def parse_filesize(s):
    """Parse a human readable file size such as '1.5 MiB' into a byte count.

    Returns None for None input or an unrecognized string.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1530
1531
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an int (None if unparsable)."""
    if s is None:
        return None

    text = s.strip()

    # Plain digits with thousands separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    _UNIT_TABLE = dict(
        (unit, 1000 ** power)
        for unit, power in (
            ('k', 1), ('K', 1),
            ('m', 2), ('M', 2),
            ('kk', 2), ('KK', 2),
        ))

    return lookup_unit_table(_UNIT_TABLE, text)
be64b5b0 1551
2f7ae819 1552
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1560
1561
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    # An abbreviation is the first three letters of the month name
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
18258362
JMF
1570
1571
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start an entity or a character reference are
    left untouched."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1578
1579
def setproctitle(title):
    """Best-effort attempt to set the process name via prctl on Linux.

    Does nothing on Jython or when libc.so.6 / prctl is unavailable.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Initializing from the bytes allocates one extra item, so the buffer
    # handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1599
1600
def remove_start(s, start):
    """Return s without the prefix start; s (possibly None) is returned as is otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1603
1604
def remove_end(s, end):
    """Return s without the suffix end; s (possibly None) is returned as is otherwise."""
    # The `end` truthiness check matters: s[:-0] would be the empty string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
1607
1608
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1616
1617
def url_basename(url):
    """Return the last path component of url (query and fragment are ignored)."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1621
1622
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1626
1627
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    get_attr: if set, the named attribute of v is converted instead.
    The result is multiplied by invscale and floor-divided by scale.
    None, the empty string and unconvertible values yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: v is of a type int() cannot handle (e.g. list, dict)
        return default
9732d77e 1640
9572013d 1641
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1644
9732d77e
PH
1645
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before conversion. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1652
1653
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    The result is multiplied by invscale and divided by scale.
    None and unconvertible values yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is of a type float() cannot handle (e.g. list, dict)
        return default
43f775e4
PH
1661
1662
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 5s' or '2.5 hours'.

    Returns the number of seconds, or None when s is not a string or has
    no recognized format.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    patterns = (
        # [[[DD:]HH:]MM:]SS[.ms]
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$',
        # Components tagged with units, optional ISO 8601 style prefix
        r'''(?ix)(?:P?T)?
            (?:
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            )?
            (?:
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            )?
            (?:
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            )?
            (?:
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            )?$''',
        # A single, possibly fractional, amount of hours or minutes
        r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$',
    )
    for pattern in patterns:
        found = re.match(pattern, text)
        if found:
            break
    else:
        return None

    parts = found.groupdict()
    duration = 0
    # Factors are applied one at a time, in this order, so that float
    # rounding stays the same as in the step-by-step multiplication
    for key, factors in (
            ('secs', ()),
            ('mins', (60,)),
            ('hours', (60, 60)),
            ('days', (24, 60, 60)),
            ('ms', ())):
        raw = parts.get(key)
        if not raw:
            continue
        amount = float(raw)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
91d7d0b3
JMF
1709
1710
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1717
1718
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and differs from the actual extension,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1724
1725
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started. """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1734
1735
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is merged into stdout before the output is scanned
        output = subprocess.Popen(
            command,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1749
1750
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output.

    version_re must contain one capturing group; by default the token after
    the word 'version' is used.  Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1760
1761
class PagedList(object):
    """Base class for lists fetched page by page; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1766
9c44d242
PH
1767
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum).

    pagesize is the number of entries on a full page.  With use_cache the
    fetched pages are kept in a dict keyed by page number.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: until the last page)."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of `start` within this page (0 unless start lies on it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` within this page (None unless end lies on it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1818
1819
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (pagecount) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: through the last page)."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted; None means no limit
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1847
1848
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1855
1856
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1863
d05cfe06
S
1864
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    # Reserved characters and '%' are passed through unchanged
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1870
1871
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; all other components are percent-encoded
    escaped = dict(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return parts._replace(**escaped).geturl()
1882
62e609ab
PH
1883
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Lines may be bytes (decoded as UTF-8) or text.  A leading byte order
    mark is removed, surrounding whitespace is stripped, and empty lines as
    well as lines starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 BOM shows up as '\ufeff' when decoded as UTF-8 and as
    # '\xef\xbb\xbf' when the bytes were decoded as latin-1
    BOM_MARKS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_MARKS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1898
1899
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1902
1903
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string.

    Entries of query replace parameters of the same name already present
    in url.  A false query (empty or None) returns url unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    # Second argument enables doseq
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 1912
8e60dc75 1913
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with some of its parts overridden.

    url and data replace the originals when truthy, headers are merged over
    the original headers and query is merged into the URL's query string.
    A HEAD request yields a HEADRequest; any timeout attribute is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)

    if req.get_method() == 'HEAD':
        request_class = HEADRequest
    else:
        request_class = compat_urllib_request.Request

    target_url = update_url_query(url or req.get_full_url(), query)
    new_req = request_class(
        target_url,
        data=data or req.data,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1926
1927
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in d.

    When key_or_keys is a list or tuple, the value of the first key that is
    present and not None (and truthy, unless skip_false_values is False) is
    returned, or default if there is none.  Otherwise this behaves like
    d.get(key_or_keys, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)

    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1936
1937
329ca3be
S
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails or has the wrong type.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed.  When expected_type is given, a result that is not an
    instance of it is discarded as well.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
1946
1947
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding/errors if it is not text yet."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1950
16392824 1951
a1a530b0
PH
# Age limit used for each US rating label (see parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1959
1960
146c80e2
S
def parse_age_limit(s):
    """Turn an age limit string into a number.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+'),
    or one of the labels in US_RATINGS.  Returns None for None and for
    anything unrecognized.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1966
1967
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';' and
    trailing // comments) and return the payload between the parentheses.

    Text that does not look like a JSONP call is returned unchanged.
    """
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_pattern, r'\1', code)
478c2c61
PH
1971
1972
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Handles single-quoted strings, unquoted and numeric keys, hexadecimal
    and octal integer literals, /* */ comments and trailing commas.

    Returns the converted text; it is not validated as JSON.
    """
    # (pattern, base) pairs for the integer notations JSON does not support
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas have no JSON counterpart
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the contents so they can be wrapped in double quotes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
            # A string literal stays a string even if its contents look like
            # a hexadecimal or octal number (e.g. "0x10" or "010")
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                # Keys (token ends with ':') must be quoted in JSON
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers and decimal keys become JSON strings
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
2010
2011
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def rank(qid):
        # Position in quality_ids is the quality; unknown ids rank lowest
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return rank
2020
acd69589
PH
2021
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2023
a020a0dc
PH
2024
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged when it is None or at most length characters long.
    Otherwise s is cut so that the result, including the trailing '...',
    is exactly length characters long.  When length is too small to hold
    the ellipsis, the ellipsis itself is truncated to length.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    if keep < 0:
        # A negative slice end would count from the end of s and give a
        # result longer than the input
        return ELLIPSES[:max(length, 0)]
    return s[:keep] + ELLIPSES
48844745
PH
2033
2034
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
2037
2038
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version string cannot be parsed, the
    answer is "not outdated" if assume_new is true and "outdated" otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
732ea2f0
PH
2046
2047
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or the interpreter is frozen
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
2053
2054
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted_args = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2ccd1b10
PH
2058
2059
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2067
2068
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters after the media type (e.g. "; charset=utf-8") are ignored and
    known types are matched case-insensitively.  For an unknown type the
    subtype (the part after the last '/') is returned as given.

    Returns None when mt is None.
    """
    if mt is None:
        return None

    # Content-Type headers often carry parameters; drop them before the lookup
    mt = mt.split(';', 1)[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt.lower())
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    # Lower-case only for the lookup, so unknown subtypes keep their spelling
    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res.lower(), res)
2095
2096
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The filename of an attachment Content-Disposition header is tried
    first; otherwise the Content-Type header decides.
    """
    header = url_handle.headers.get

    disposition = header('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(header('Content-Type'))
05900629
PH
2109
2110
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI holding the bytes data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2113
2114
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set, or content available for everyone: never blocked
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2123
2124
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # UTF-32 marks come first because the UTF-16 LE mark is a prefix of the UTF-32 LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for marker, codec in BOMS:
        if first_bytes.startswith(marker):
            payload, encoding = first_bytes[len(marker):], codec
            break
    else:
        # No byte order mark: treat the data as UTF-8
        payload, encoding = first_bytes, 'utf-8'

    text = payload.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
2143
2144
def determine_protocol(info_dict):
    """Work out the download protocol for info_dict.

    An explicit 'protocol' entry wins.  Otherwise the protocol is derived
    from the start of the URL (rtmp, mms, rtsp), then from its extension
    (m3u8, f4m), and finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2165
2166
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Left-align every column but the last, padded to its width plus one
    padded_cells = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_cells) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2173
2174
2175def _match_one(filter_part, dct):
2176 COMPARISON_OPERATORS = {
2177 '<': operator.lt,
2178 '<=': operator.le,
2179 '>': operator.gt,
2180 '>=': operator.ge,
2181 '=': operator.eq,
2182 '!=': operator.ne,
2183 }
2184 operator_rex = re.compile(r'''(?x)\s*
2185 (?P<key>[a-z_]+)
2186 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2187 (?:
2188 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2189 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2190 )
2191 \s*$
2192 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2193 m = operator_rex.search(filter_part)
2194 if m:
2195 op = COMPARISON_OPERATORS[m.group('op')]
2196 if m.group('strval') is not None:
2197 if m.group('op') not in ('=', '!='):
2198 raise ValueError(
2199 'Operator %s does not support string values!' % m.group('op'))
2200 comparison_value = m.group('strval')
2201 else:
2202 try:
2203 comparison_value = int(m.group('intval'))
2204 except ValueError:
2205 comparison_value = parse_filesize(m.group('intval'))
2206 if comparison_value is None:
2207 comparison_value = parse_filesize(m.group('intval') + 'B')
2208 if comparison_value is None:
2209 raise ValueError(
2210 'Invalid integer value %r in filter part %r' % (
2211 m.group('intval'), filter_part))
2212 actual_value = dct.get(m.group('key'))
2213 if actual_value is None:
2214 return m.group('none_inclusive')
2215 return op(actual_value, comparison_value)
2216
2217 UNARY_OPERATORS = {
2218 '': lambda v: v is not None,
2219 '!': lambda v: v is None,
2220 }
2221 operator_rex = re.compile(r'''(?x)\s*
2222 (?P<op>%s)\s*(?P<key>[a-z_]+)
2223 \s*$
2224 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2225 m = operator_rex.search(filter_part)
2226 if m:
2227 op = UNARY_OPERATORS[m.group('op')]
2228 actual_value = dct.get(m.group('key'))
2229 return op(actual_value)
2230
2231 raise ValueError('Invalid filter part %r' % filter_part)
2232
2233
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # Every '&'-separated part has to match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2239
2240
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the info dict passes the filter
    and a "skipping" message naming the video otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
91410c9b
PH
2249
2250
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Accepts an offset in seconds with an optional 's' suffix ("12.5s") and
    a clock time "H:MM:SS" whose fraction is separated by '.' or ':'.
    Returns a float, or None for empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # The fraction may be written "SS:mmm"; turn it into "SS.mmm"
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2262
2263
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode "HH:MM:SS,mmm".

    The value is rounded to whole milliseconds before being split into
    fields; truncating the fractional part directly loses a millisecond
    for values such as 2.3, which is stored slightly below 2.3.
    """
    total_ms = int(round(seconds * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
bf6427d2
YCH
2266
2267
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out; they still use up a cue number.

    Raises ValueError when the document contains no paragraphs.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target that collects the text of one paragraph,
        # turning <br> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-parse the serialized paragraph with the text-collecting target
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try each known namespace in turn, then un-namespaced <p> elements
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2321
2322
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] when it is unset (None)."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2326
2327
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    Returns [] when params[param] is unset (None), like the other cli_*
    helpers.  Otherwise returns [command_option, value], or the single
    argument command_option + separator + value when separator is given;
    value is true_value or false_value.

    A value that is set but not a bool trips an assertion.
    """
    param = params.get(param)
    if param is None:
        # Option not configured: emit nothing
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2334
2335
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2339
2340
def cli_configuration_args(params, param, default=[]):
    """Return the extra command line arguments stored in params[param].

    When the entry is unset (None), default is returned; a list default is
    copied first, so callers that extend the result cannot alter the shared
    default list.  A value that is set but not a list trips an assertion.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The default list object is shared between all calls
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2347
2348
39672624
YCH
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters count, so 'en-US' is looked up as 'en'
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        # None when no two-letter code maps to the given code
        return next(matches, None)
2549
2550
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code (any letter case) to
        the corresponding full name; None for an unknown code"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2809
2810
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler whose proxy can be chosen per request.

    A 'Ytdl-request-proxy' request header overrides the configured proxy
    for that request; the header is removed before the request goes on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            handler_name = '%s_open' % scheme
            setattr(
                self, handler_name,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2834
2835
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
81bdc8fd
YCH
2851
2852
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit characters; by default the first n characters
    of 0-9, a-z, A-Z are used, which allows bases up to 62.

    Raises ValueError when n is smaller than 2, when n exceeds the number
    of available digits, or when num is negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if n < 2:
        # Base 1 never reduces num and base 0 divides by zero
        raise ValueError('base %d is not supported' % n)

    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        # Floor division of a negative number never reaches zero
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
f52354a8
YCH
2869
2870
def decode_packed_codes(code):
    """Unpack JavaScript packed as "}('code',base,count,'a|b|c'.split('|')".

    Every word of the packed code is replaced by the symbol stored at the
    index that word encodes in the given base; an empty symbol stands for
    the word itself.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfucasted_code, base_text, count_text, symbols_text = packed.groups()
    radix = int(base_text)
    total = int(count_text)
    symbols = symbols_text.split('|')

    symbol_table = {}
    for index in range(total - 1, -1, -1):
        word = encode_base_n(index, radix)
        symbol_table[word] = symbols[index] or word

    def _substitute(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfucasted_code)
e154c651 2889
2890
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into a dict.

    Surrounding double quotes are removed from quoted values; all values
    are returned as strings.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    info = {}
    for name, raw_value in pairs:
        info[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return info