]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[ooyala] fix typo
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
efa97bdc 45 compat_os_name,
8c25f81b 46 compat_parse_qs,
702ccf2d 47 compat_shlex_quote,
be4a824d 48 compat_socket_create_connection,
8c25f81b 49 compat_str,
edaa23f8 50 compat_struct_pack,
d3f8e038 51 compat_struct_unpack,
8c25f81b
PH
52 compat_urllib_error,
53 compat_urllib_parse,
15707c7e 54 compat_urllib_parse_urlencode,
8c25f81b 55 compat_urllib_parse_urlparse,
7581bfc9 56 compat_urllib_parse_unquote_plus,
8c25f81b
PH
57 compat_urllib_request,
58 compat_urlparse,
810c10ba 59 compat_xpath,
8c25f81b 60)
4644ac55 61
71aff188
YCH
62from .socks import (
63 ProxyType,
64 sockssocket,
65)
66
4644ac55 67
51fb4995
YCH
def register_socks_protocols():
    """Register the SOCKS URL schemes as netloc-bearing schemes.

    In Python < 2.6.5, urlsplit() suffers from https://bugs.python.org/issue7904:
    URLs whose scheme is not listed in urlparse.uses_netloc are not handled
    correctly, so each SOCKS scheme is appended to that list if missing.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
75
76
468e2e92
FV
# The type of a compiled regular expression is not clearly defined otherwise,
# so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Unique sentinel meaning "no default supplied" (distinct from None).
NO_DEFAULT = object()
95
7105440c
YCH
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Month names per language code, in calendar order.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre',
    ],
}

# File extensions of known media/manifest formats.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode:
# maps each accented character to its ASCII replacement (one or two letters).
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 126
46f59e89
S
# strptime formats that are unambiguous regardless of day/month ordering.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Common formats followed by the ambiguous numeric ones read as day-first.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) + [
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
]

# Common formats followed by the ambiguous numeric ones read as month-first.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) + [
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]

# Captures the four arguments of a packed-JavaScript call:
# payload, two integers, and a '|'-separated word list.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
180
7105440c 181
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(), provided
    it can actually be used to encode text; otherwise falls back to 'UTF-8'.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding raises here.
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
d77c3dfd 195
f4bfd65f 196
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    tmp_prefix = os.path.basename(fn)
    tmp_dir = os.path.dirname(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = tmp_prefix.decode(fs_encoding)
        tmp_dir = tmp_dir.decode(fs_encoding)

    args = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args['mode'] = 'w'
        args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then re-raise the
        # original error.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
249
250
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] by filtering matches manually """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
265
d7e66d39
JMF
266# On python2.6 the xml.etree.ElementTree.Element methods don't support
267# the namespace parameter
5f6a1245
JW
268
269
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{namespace-uri}tag'.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are kept unchanged.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([_expand(step) for step in path.split('/')])
280
d77c3dfd 281
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath may be a single expression or an iterable of expressions that are
    tried in order until one matches.

    When nothing matches: return default if one was given, otherwise raise
    ExtractorError if fatal is set, otherwise return None. name is only used
    in the error message (defaults to xpath).
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty sequence of expressions
        # is treated as a miss instead of leaving n unbound.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
303
304
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element() (its result is passed
    through when it is None or equals default). An element without text
    yields default if given, raises ExtractorError if fatal, else None.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
318
319
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath[@key].

    When no such element exists: return default if given, raise
    ExtractorError if fatal, else return None.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
331
332
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    attribute_name = 'id'
    return get_element_by_attribute(attribute_name, id, html)
43e8fafd 336
12ea2f30 337
84c237fb
YCH
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name as a whole word"""
    # The value is already a regex, so tell the callee not to escape it.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_value_re, html, escape_value=False)
342
343
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    if escape_value:
        value = re.escape(value)

    # Matches an opening tag carrying attribute=value (optionally quoted),
    # surrounded by any other attributes, up to the first matching close tag.
    found = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if found is None:
        return None

    content = found.group('content')
    # Drop one leading quote character together with the last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 367
c5229f39 368
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the (name, value) pairs of the start tag as a dict."""
        self.attrs = dict(attrs)
377
c5229f39 378
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attribute_parser = HTMLAttributeParser()
    attribute_parser.feed(html_element)
    attribute_parser.close()
    return attribute_parser.attrs
9e6dd238 399
c5229f39 400
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning; only markup produces line breaks.
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> becomes a newline
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        # so does a paragraph boundary (</p><p ...>)
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # strip all remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
416
417
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the name produced by sanitize_path(); if that name is unchanged, or the
    failure was a permission error, the original exception is re-raised, like
    the standard open() function would.

    The special name '-' selects standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Prefer the underlying binary stream when there is one.
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
448
449
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 457
5f6a1245 458
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the original as possible.
        return result if result else '_'

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 497
5f6a1245 498
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s

    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)

    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)

    sanitized_path = []
    for path_part in parts:
        if path_part not in ['.', '..']:
            # Replace forbidden characters and a trailing space or dot by '#'.
            path_part = re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        sanitized_path.append(path_part)

    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
515
516
67dda517
S
517# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
518# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Return url with 'http:' prepended when it is protocol-relative ('//...')."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
521
522
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
67dda517
S
525
526
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first occurrences in order """
    unique = []
    for item in iterable:
        # List membership (==) is used, so unhashable items work too.
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 534
912b38b4 535
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without the leading '&' but with the
    trailing ';' (e.g. 'amp;' or '#x41;'). Unknown or invalid entities are
    returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; numbers too large for
        # a C integer raise OverflowError instead. Both mean "not a valid
        # character reference", so fall through to the literal form.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
565
566
def unescapeHTML(s):
    """Replace every '&...;' entity in s by its character; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        # group(1) is the entity name including its trailing ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', _decode_entity, s)
d77c3dfd 574
8bf48f23 575
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for file names and subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
586
587
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever the platform accepts Unicode file names,
    otherwise s encoded with get_subprocess_encoding() (unencodable
    characters are dropped).
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    accepts_unicode = sys.version_info >= (3, 0)

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not accepts_unicode and not for_subprocess and sys.platform == 'win32':
        accepts_unicode = sys.getwindowsversion()[0] >= 5

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if not accepts_unicode:
        accepts_unicode = sys.platform.startswith('java')

    if accepts_unicode:
        return s
    return s.encode(get_subprocess_encoding(), 'ignore')
610
611
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name with get_subprocess_encoding().

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged. for_subprocess is accepted but not used.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 621
f07b74fc
PH
622
def encodeArgument(s):
    """Encode a command-line argument the same way as a subprocess file name."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
630
631
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
634
635
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 644
5f6a1245 645
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly one hour is '1:00:00' (not '60:00') and exactly one minute is
    '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
653
a0ddb8a2 654
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler using the best SSL setup available.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are passed on to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 678
732ea2f0 679
08f2a92c
JMF
def bug_reports_message():
    """Return the standard suffix asking users to report an unexpected error."""
    update_cmd = (
        'type  youtube-dl -U  to update' if ytdl_is_updateable()
        else 'see  https://yt-dl.org/update  on how to update')
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
689
690
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Errors raised while handling one of these exceptions are never
        # reported as bugs.
        expected_exception_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_exception_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted_lines = traceback.format_tb(self.traceback)
        return ''.join(formatted_lines)
01951dda 718
1c256f70 719
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
725
726
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
730
731
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so the original failure can still be inspected.
        self.exc_info = exc_info
d77c3dfd
FV
744
745
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
753
754
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error description; available as .msg and via str()."""
        # Initialize the base class so that str(e) and e.args carry the
        # message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 764
5f6a1245 765
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
769
770
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
778
779
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """downloaded and expected are sizes, both in bytes."""
        # Give the exception a readable message; without it str(e) is empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 792
5f6a1245 793
efa97bdc
YCH
class XAttrMetadataError(Exception):
    """Failure to write extended-attribute metadata.

    code is an errno value (or None) and msg the error text. Both are kept
    as attributes and classified into .reason, which is one of 'NO_SPACE',
    'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # (the message checks use the standard OS error texts, e.g.
        # "Disk quota exceeded" for EDQUOT)
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
808
809
class XAttrUnavailableError(Exception):
    # NOTE(review): presumably raised when extended attributes cannot be
    # used at all; the raising code is not visible here - confirm.
    pass
812
813
c5a59d93 814def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
815 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
816 # expected HTTP responses to meet HTTP/1.0 or later (see also
817 # https://github.com/rg3/youtube-dl/issues/6727)
818 if sys.version_info < (3, 0):
5a1a2e94 819 kwargs[b'strict'] = True
be4a824d
PH
820 hc = http_class(*args, **kwargs)
821 source_address = ydl_handler._params.get('source_address')
822 if source_address is not None:
823 sa = (source_address, 0)
824 if hasattr(hc, 'source_address'): # Python 2.7+
825 hc.source_address = sa
826 else: # Python 2.6
827 def _hc_connect(self, *args, **kwargs):
828 sock = compat_socket_create_connection(
829 (self.host, self.port), self.timeout, sa)
830 if is_https:
d7932313
PH
831 self.sock = ssl.wrap_socket(
832 sock, self.key_file, self.cert_file,
833 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
834 else:
835 self.sock = sock
836 hc.connect = functools.partial(_hc_connect, hc)
837
838 return hc
839
840
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a new dict is returned without the marker
    and without any Accept-Encoding header (matched case-insensitively).
    Otherwise headers itself is returned.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    filtered_headers = {}
    for header_name, header_value in headers.items():
        if header_name.lower() != 'accept-encoding':
            filtered_headers[header_name] = header_value
    del filtered_headers[marker]
    return filtered_headers
87f0e62d
YCH
849
850
acebc9cd 851class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
852 """Handler for HTTP requests and responses.
853
854 This class, when installed with an OpenerDirector, automatically adds
855 the standard headers to every HTTP request and handles gzipped and
856 deflated responses from web servers. If compression is to be avoided in
857 a particular request, the original request in the program code only has
0424ec30 858 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
859 removed before making the real request.
860
861 Part of this code was copied from:
862
863 http://techknack.net/python-urllib2-handlers/
864
865 Andrew Rowls, the author of that code, agreed to release it to the
866 public domain.
867 """
868
be4a824d
PH
869 def __init__(self, params, *args, **kwargs):
870 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
871 self._params = params
872
873 def http_open(self, req):
71aff188
YCH
874 conn_class = compat_http_client.HTTPConnection
875
876 socks_proxy = req.headers.get('Ytdl-socks-proxy')
877 if socks_proxy:
878 conn_class = make_socks_conn_class(conn_class, socks_proxy)
879 del req.headers['Ytdl-socks-proxy']
880
be4a824d 881 return self.do_open(functools.partial(
71aff188 882 _create_http_connection, self, conn_class, False),
be4a824d
PH
883 req)
884
59ae15a5
PH
885 @staticmethod
886 def deflate(data):
887 try:
888 return zlib.decompress(data, -zlib.MAX_WBITS)
889 except zlib.error:
890 return zlib.decompress(data)
891
892 @staticmethod
893 def addinfourl_wrapper(stream, headers, url, code):
894 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
895 return compat_urllib_request.addinfourl(stream, headers, url, code)
896 ret = compat_urllib_request.addinfourl(stream, headers, url)
897 ret.code = code
898 return ret
899
acebc9cd 900 def http_request(self, req):
51f267d9
S
901 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
902 # always respected by websites, some tend to give out URLs with non percent-encoded
903 # non-ASCII characters (see telemb.py, ard.py [#3412])
904 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
905 # To work around aforementioned issue we will replace request's original URL with
906 # percent-encoded one
907 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
908 # the code of this workaround has been moved here from YoutubeDL.urlopen()
909 url = req.get_full_url()
910 url_escaped = escape_url(url)
911
912 # Substitute URL if any change after escaping
913 if url != url_escaped:
15d260eb 914 req = update_Request(req, url=url_escaped)
51f267d9 915
33ac271b 916 for h, v in std_headers.items():
3d5f7a39
JK
917 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
918 # The dict keys are capitalized because of this bug by urllib
919 if h.capitalize() not in req.headers:
33ac271b 920 req.add_header(h, v)
87f0e62d
YCH
921
922 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
923
924 if sys.version_info < (2, 7) and '#' in req.get_full_url():
925 # Python 2.6 is brain-dead when it comes to fragments
926 req._Request__original = req._Request__original.partition('#')[0]
927 req._Request__r_type = req._Request__r_type.partition('#')[0]
928
59ae15a5
PH
929 return req
930
def http_response(self, req, resp):
    """Post-process a response: undo gzip/deflate and escape redirects.

    A response whose ``Content-encoding`` is ``gzip`` or ``deflate`` is
    replaced by a new response object wrapping the decompressed body (the
    header is removed). For 3xx responses a ``Location`` header that is not
    fully percent-encoded is rewritten in its escaped form.
    """
    old_resp = resp

    # gzip
    if resp.headers.get('Content-encoding', '') == 'gzip':
        raw = resp.read()
        try:
            payload = io.BytesIO(
                gzip.GzipFile(fileobj=io.BytesIO(raw), mode='rb').read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    truncated = gzip.GzipFile(
                        fileobj=io.BytesIO(raw[:-trim]), mode='rb')
                    payload = io.BytesIO(truncated.read())
                except IOError:
                    continue
                break
            else:
                # No amount of trimming helped: report the first failure
                raise original_ioerror
        resp = self.addinfourl_wrapper(
            payload, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        del resp.headers['Content-encoding']

    # deflate
    if resp.headers.get('Content-encoding', '') == 'deflate':
        inflated = io.BytesIO(self.deflate(resp.read()))
        resp = self.addinfourl_wrapper(
            inflated, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        del resp.headers['Content-encoding']

    # Percent-encode redirect URL of Location HTTP header to satisfy
    # RFC 3986 (see https://github.com/rg3/youtube-dl/issues/6457).
    if 300 <= resp.code < 400:
        location = resp.headers.get('Location')
        if location:
            # As of RFC 2616 default charset is iso-8859-1 that is
            # respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
            else:
                location = location.decode('utf-8')
            location_escaped = escape_url(location)
            if location_escaped != location:
                del resp.headers['Location']
                if sys.version_info < (3, 0):
                    location_escaped = location_escaped.encode('utf-8')
                resp.headers['Location'] = location_escaped
    return resp
0f8d03f8 977
acebc9cd
PH
# HTTPS requests and responses go through the same hooks as plain HTTP
https_request = http_request
https_response = http_response
bf50b038 980
5de90176 981
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* that connects through a SOCKS proxy.

    base_class must be ``HTTPConnection`` or ``HTTPSConnection`` (or a
    subclass of one of them). socks_proxy is a proxy URL whose scheme is one
    of ``socks5``, ``socks``, ``socks4`` or ``socks4a``; host, port
    (default 1080), username and password are taken from it.

    Raises ValueError when the proxy URL uses any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously this fell through and failed later with an obscure
        # UnboundLocalError on socks_type
        raise ValueError('Unsupported SOCKS proxy scheme: %s' % scheme)

    def unquote_if_non_empty(s):
        # Credentials are optional; leave None/'' untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only a numeric timeout is forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1023
1024
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store *params* and the connection class (HTTPSConnection by default)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = (
            https_conn_class or compat_http_client.HTTPSConnection)

    def https_open(self, req):
        """Open *req*, routing through a SOCKS proxy when requested.

        The proxy is requested via the internal ``Ytdl-socks-proxy`` header,
        which is removed from the request before it is opened.
        """
        conn_class = self._https_conn_class

        # Forward whichever SSL settings the running Python provides
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connect = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connect, req, **open_kwargs)
be4a824d
PH
1048
1049
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to the stock HTTPCookieProcessor response handling."""
        # Python 2 will choke on next HTTP request in row if there are
        # non-ASCII characters in Set-Cookie HTTP header of last response
        # (see https://github.com/rg3/youtube-dl/issues/6769).
        # The workaround below (percent-encoding the Set-Cookie header before
        # HTTPCookieProcessor processes it) is currently disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base = compat_urllib_request.HTTPCookieProcessor
        return base.http_response(self, request, response)

    # HTTPS traffic uses the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1072
1073
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC offset off *date_str*.

    Returns a ``(timedelta, remaining_string)`` tuple. A trailing ``Z`` or
    ``+HH:MM`` / ``-HHMM`` style offset is removed from the string; without
    one the offset is zero and the string is returned unchanged.
    """
    match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(match.group('tz'))]
    if not match.group('sign'):
        # Plain "Z" suffix: UTC
        return datetime.timedelta(), remainder

    sign = 1 if match.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(match.group('hours')),
        minutes=sign * int(match.group('minutes')))
    return offset, remainder
1090
1091
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are discarded. When *timezone* (a timedelta) is not
    given it is extracted from the string. Returns None for a None input or
    when the string does not match ``YYYY-MM-DD<delimiter>HH:MM:SS``.
    """
    if date_str is None:
        return None

    # Fractions of a second are not supported by the format below
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        moment = datetime.datetime.strptime(cleaned, layout) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        return None
912b38b4
PH
1109
1110
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime formats to try, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1113
1114
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when *date_str* is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    result = None
    # Every format is tried; if several match, the last match is kept
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1141
5f6a1245 1142
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when *date_str* is None or cannot be parsed. A "PM" marker
    anywhere in the string adds twelve hours to the parsed time.
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    pm_offset = datetime.timedelta(hours=pm_delta)
    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_offset
            return calendar.timegm(moment.timetuple())
        except ValueError:
            pass

    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(text)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
    return None
46f59e89
S
1164
1165
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*.

    The text after the last dot of the part before ``?`` is used when it is
    purely alphanumeric, or when (ignoring trailing slashes) it is a known
    extension. Otherwise, and for a None url, *default_ext* is returned.
    """
    if url is None:
        return default_ext

    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1177
5f6a1245 1178
def subtitles_filename(filename, sub_lang, sub_format):
    """Return *filename* with its last extension replaced by ``<lang>.<format>``."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 1181
5f6a1245 1182
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'

    # timedelta takes plural keyword names (days, weeks)
    return today + datetime.timedelta(**{unit + 's': amount})
5f6a1245
JW
1210
1211
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1220
5f6a1245 1221
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed like start/end
        when = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= when <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
1251
1252
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode with the user's preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1261
1262
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the Win32 WriteConsoleW call; it is only used
    when `out` has file descriptor 1 or 2 and the matching standard handle
    is an actual console. Raises OSError when WriteConsoleW reports failure.
    """
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier passed to GetStdHandle
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices and handles for
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1336
1337
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (sys.stderr by default) and flush it.

    On win32, when no encoding is forced, the Windows console writer is
    tried first. Otherwise the text is encoded (ignoring unencodable
    characters) for binary streams, Python 2 and streams exposing a
    ``buffer``; other streams receive the text unchanged.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1358
1359
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of a byte string as a list of integers."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return [ord(ch) for ch in bs]
1367
c257baff 1368
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string (one unsigned byte each)."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1373
1374
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using whichever
# mechanism the platform offers: LockFileEx/UnlockFileEx on win32,
# fcntl.flock elsewhere, and stubs that raise IOError when fcntl is missing.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock f starting at offset 0; raises OSError on failure
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in
        # the Win32 API), 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file; raises OSError on failure
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            # Exclusive or shared flock on the whole file
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1448
1449
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context manager.

    The file is opened immediately on construction. Entering the context
    takes a shared lock for mode 'r' and an exclusive lock for 'a'/'w';
    leaving it releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is a different
            # class from IOError on Python 2; catch both so the handle is
            # never leaked when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1479
1480
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1484
1485
def shell_quote(args):
    """Return *args* as one shell-escaped, space-separated command line.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # pipes.quote is undocumented and the pipes module is deprecated
        # (removed in Python 3.13); use the compat wrapper this file
        # already imports instead
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1495
1496
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    *data* is JSON-encoded into the URL fragment. Data already smuggled
    into *url* is merged in and takes precedence over *data*. The caller's
    dict is left untouched.
    """
    url, idata = unsmuggle_url(url, {})
    # Work on a copy: updating `data` in place would silently modify the
    # caller's dictionary
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
1505
1506
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into ``(url, data)``.

    URLs without smuggled data are returned unchanged together with
    *default*.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
02dbf93f
PH
1514
1515
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50KiB'.

    Returns 'N/A' for None. A str argument is converted with float().
    The unit is chosen from B up to YiB; values beyond that range are
    expressed in the nearest available unit.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: huge values used to raise
        # IndexError and tiny fractions produced a negative index (and
        # therefore a bogus 'YiB' suffix)
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1528
1c088fa8 1529
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of *s* using *unit_table*.

    unit_table maps unit names to multipliers. The number may use ',' or
    '.' as decimal separator. Returns the scaled value as an int, or None
    when *s* does not start with a number followed by a known unit.
    """
    alternation = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternation, s)
    if not found:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1539
1540
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '10 kilobytes' into a byte count.

    Returns None for None or when no known unit follows the number.
    """
    if s is None:
        return None

    # (symbol letter, decimal prefix name, binary prefix name), by power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1610
1611
def parse_count(s):
    """Parse a count such as '1,234', '1.5k' or '2M' into an int.

    Returns None for None or when the text is neither a plain number nor a
    number followed by a known multiplier suffix.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
be64b5b0 1631
2f7ae819 1632
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) full name

    Names are looked up in MONTH_NAMES for *lang*, falling back to the
    English names for unknown languages. Returns None for unknown names.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1642
1643
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        # Abbreviations are the first three letters of the full name
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1652
1653
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start a predefined entity or a numeric
    character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1660
1661
def setproctitle(title):
    """Best-effort: set the process name shown by tools such as ps/top.

    Uses prctl() from libc.so.6; silently does nothing on Jython, when the
    library cannot be loaded or when it has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    # Passing the bytes directly makes the buffer one byte larger than the
    # title and NUL-terminated. A buffer of exactly len(title) bytes has no
    # terminator, so prctl could read past its end for short titles.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 == PR_SET_NAME (see linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1681
1682
def remove_start(s, start):
    """Return *s* without the prefix *start*; None and non-matching strings pass through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1685
1686
def remove_end(s, end):
    """Return *s* without the suffix *end*; None and non-matching strings pass through."""
    if s is None or not s.endswith(end):
        return s
    # Slice by absolute position: s[:-len(end)] would be s[:0] == '' for an
    # empty suffix
    return s[:len(s) - len(end)]
2b9faf55
PH
1689
1690
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding *s*."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1698
1699
def url_basename(url):
    """Return the last component of the path of *url*."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1703
1704
02dc0a36
S
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1707
1708
e34c3361
S
def urljoin(base, path):
    """Join *path* onto *base*, returning None for unusable input.

    A path that is already an absolute or protocol-relative http(s) URL is
    returned as is. None is returned when path is empty or not text, or
    when base is not such a URL.
    """
    absolute_url = r'^(?:https?:)?//'
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(absolute_url, path):
        return path
    if not (isinstance(base, compat_str) and re.match(absolute_url, base)):
        return None
    return compat_urlparse.urljoin(base, path)
1717
1718
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """Request that is sent with the HTTP HEAD method."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1722
1723
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request that is sent with the HTTP PUT method."""

    def get_method(self):
        return 'PUT'
1727
1728
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to an int, returning *default* when that is not possible.

    When get_attr is given, the attribute of that name is read from v
    first. None and '' count as missing. The result is
    ``int(v) * invscale // scale``.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: values such as lists or dicts that int() rejects
        return default
9732d77e 1741
9572013d 1742
40a90862
JMF
def str_or_none(v, default=None):
    """Return *v* converted to text, or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1745
9732d77e
PH
1746
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1753
1754
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float, returning *default* when that is not possible.

    The result is ``float(v) * invscale / scale``.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: values such as lists or dicts that float() rejects
        return default
43f775e4
PH
1762
1763
b72b4431
S
def strip_or_none(v):
    """Return *v* with surrounding whitespace stripped; None stays None."""
    if v is None:
        return None
    return v.strip()
1766
1767
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in order: colon-separated
    ('[[DD:]HH:]MM:]SS[.ms]'), unit-suffixed ('1h 2m 3.5s', optionally
    prefixed by 'PT'/'T'), and a lone fractional amount of hours or minutes
    ('1.5 hours'). Returns None for non-string input or unrecognized text.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    clock = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if clock:
        days, hours, mins, secs, ms = clock.groups()
    else:
        with_units = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if with_units:
            days, hours, mins, secs, ms = with_units.groups()
        else:
            single_unit = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single_unit:
                return None
            hours, mins = single_unit.groups()

    # Components that were not present stay None and contribute nothing
    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading dot, e.g. '.25'
        total += float(ms)
    return total
91d7d0b3
JMF
1814
1815
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* in front of the real extension of *filename*.

    When expected_real_ext is given and the file's extension differs from
    it, *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1822
1823
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    When expected_real_ext is given and the file's extension differs from
    it, *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1829
1830
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1839
1840
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The combined stdout/stderr output is handed to detect_exe_version. """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1858
1859
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must contain one capturing group; by default the token
    after the word 'version' is used. Returns *unrecognized* when the
    pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1869
1870
class PagedList(object):
    """Base class for lazily paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1875
9c44d242
PH
1876
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one, only as far as needed."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        """pagefunc(pagenum) yields the entries of one page of pagesize items.

        With use_cache, fetched pages are kept and not requested again.
        """
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            first_index = pagenum * size
            next_first_index = first_index + size
            if start >= next_first_index:
                continue

            page = self._cache.get(pagenum) if self._use_cache else None
            if page is None:
                page = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page

            # Offset of the first wanted entry within this page
            if first_index <= start < next_first_index:
                offset = start % size
            else:
                offset = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and first_index <= end <= next_first_index:
                limit = ((end - 1) % size) + 1
            else:
                limit = None

            if offset != 0 or limit is not None:
                page = page[offset:limit]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + offset < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first_index:
                break
        return collected
81c2f20b
PH
1927
1928
9c44d242
PH
1929class InAdvancePagedList(PagedList):
1930 def __init__(self, pagefunc, pagecount, pagesize):
1931 self._pagefunc = pagefunc
1932 self._pagecount = pagecount
1933 self._pagesize = pagesize
1934
1935 def getslice(self, start=0, end=None):
1936 res = []
1937 start_page = start // self._pagesize
1938 end_page = (
1939 self._pagecount if end is None else (end // self._pagesize + 1))
1940 skip_elems = start - start_page * self._pagesize
1941 only_more = None if end is None else end - start
1942 for pagenum in range(start_page, end_page):
1943 page = list(self._pagefunc(pagenum))
1944 if skip_elems:
1945 page = page[skip_elems:]
1946 skip_elems = None
1947 if only_more is not None:
1948 if len(page) < only_more:
1949 only_more -= len(page)
1950 else:
1951 page = page[:only_more]
1952 res.extend(page)
1953 break
1954 res.extend(page)
1955 return res
1956
1957
def uppercase_escape(s):
    """Replace every literal ``\\UXXXXXXXX`` sequence in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        # The decoder returns (text, consumed_length); only the text matters
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
0fe2ff78
YCH
1964
1965
def lowercase_escape(s):
    """Replace every literal ``\\uXXXX`` sequence in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        # The decoder returns (text, consumed_length); only the text matters
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
b53466e1 1972
d05cfe06
S
1973
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a text string is UTF-8 encoded before being quoted
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters in the second argument are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1979
1980
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; every other component is percent-escaped
    escaped = parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1991
62e609ab
PH
1992
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is iterated line by line.  Byte lines are decoded as UTF-8,
    a leading byte order mark is removed, surrounding whitespace is
    stripped, and empty lines as well as lines starting with '#', ';' or
    ']' are dropped.  Returns the remaining lines as a list.
    """
    # '\ufeff' is a correctly decoded BOM; '\xef\xbb\xbf' is what the three
    # UTF-8 BOM bytes look like when the file was decoded byte-per-character
    # (e.g. as latin-1).  Previously only the latter was recognized, so a
    # BOM survived on the first URL of properly decoded files.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2007
2008
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2011
2012
def update_url_query(url, query):
    """Return url with the entries of the query dict merged into its query.

    Keys already present in the URL are overwritten; a falsy query returns
    url untouched.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    # doseq=True: the parsed values are lists and must be expanded
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2021
8e60dc75 2022
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with selected parts replaced.

    headers are merged over the existing ones, data and url replace the
    originals when truthy, and query is merged into the URL's query
    string.  The HTTP method (HEAD/PUT/other) and, if present, the
    timeout attribute of req are carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    # Pick the request class that reproduces the original HTTP method
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2041
2042
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple of keys.

    For a list/tuple, keys that are missing or map to None are skipped;
    with skip_false_values (the default) any falsy value is skipped too.
    default is returned when no key yields a value.  A single key is a
    plain d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2051
2052
329ca3be
S
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter
    are swallowed.  When expected_type is given, a result that is not an
    instance of it also yields None.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
2061
2062
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding/errors if needed."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2065
16392824 2066
a1a530b0
PH
# Rating label -> age limit; consulted by parse_age_limit() for labels
# that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2074
2075
a8795327
S
# TV rating label -> age limit; the fallback table of parse_age_limit()
# when a label is not found in US_RATINGS
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2084
2085
def parse_age_limit(s):
    """Convert an age limit given as int, "NN"/"NN+" or rating label to int.

    Integers are accepted only within 0..21.  Strings are first tried as a
    one or two digit number with optional trailing '+', then looked up in
    US_RATINGS and finally in TV_PARENTAL_GUIDELINES.  Anything else
    yields None.
    """
    # Exact type check on purpose: bool is a subclass of int
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, TV_PARENTAL_GUIDELINES.get(s))
146c80e2
S
2097
2098
def strip_jsonp(code):
    """Remove a JSONP wrapper ``callback(...);`` and return the payload.

    Trailing ``//`` comments after the call are dropped as well.  Input
    that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
2102
2103
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Handles single-quoted strings, unquoted identifiers and numeric keys,
    /* */ comments, trailing commas before ']' or '}', and hexadecimal or
    octal integer literals.  Everything not matched by the token regex is
    copied through unchanged.
    """
    INTEGER_TABLE = (
        (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
        (r'^(0+[0-7]+)\s*:?$', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas have no JSON equivalent
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for a double-quoted JSON string
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
            # A string literal stays a string even if its content looks
            # like a number: previously '"0x10"' fell through to the
            # integer conversion below and was emitted as 16.
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integer used as an object key must become a quoted key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
2141
2142
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids is the score; unknown ids rank lowest
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
2151
acd69589
PH
2152
# %-style template with the named fields title, id and ext; presumably the
# default output filename template (its users are outside this section)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2154
a020a0dc
PH
2155
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Reserve room for the ellipses so the result is `length` long
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
48844745
PH
2164
2165
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
2168
2169
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either string cannot be parsed into integer
    components, the answer is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
732ea2f0
PH
2177
2178
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either the module was loaded from a zip archive ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or the interpreter reports a frozen executable
    return hasattr(sys, 'frozen')
7d4111ed
PH
2184
2185
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
2189
2190
def error_to_compat_str(err):
    """Return the message of err as a text string on Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2198
2199
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters after ';' are ignored and matching is case-insensitive.
    A few full types are mapped first, then the subtype is looked up in a
    table of known aliases; unknown subtypes are returned as they are.
    Returns None when mt is None.
    """
    if mt is None:
        return None

    # Normalize before any lookup.  Previously the full-type table was
    # consulted with the raw header value, so 'audio/mp4; codecs=...' or
    # 'Audio/MP4' missed it, and a '/' inside a parameter confused the
    # subtype extraction.
    mt = mt.partition(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    res = mt.rpartition('/')[2].strip()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2236
2237
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into video and audio codec.

    Returns a dict with 'vcodec' and 'acodec' ('none' marks a missing
    stream), or an empty dict when nothing can be derived.  When neither
    entry of a two-element list is recognized, the first is taken as the
    video and the second as the audio codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    stripped = (part.strip() for part in codecs_str.strip().strip(',').split(','))
    splited_codecs = [part for part in stripped if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # Only the prefix before the first '.' identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing was recognized: fall back to the positional convention.
        # (This used to return the vcodec/acodec locals, which are always
        # None at this point.)
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    # A single unrecognized codec says nothing reliable about the streams;
    # the former {'vcodec': 'none', 'acodec': None} result claimed "no
    # video" without any evidence.
    return {}
2272
2273
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for an opened URL from its response headers.

    A quoted attachment filename in Content-Disposition wins if it has a
    recognizable extension; otherwise Content-Type is mapped through
    mimetype2ext().
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2286
2287
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI for the bytes data with mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2290
2291
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2300
2301
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Order matters: the UTF-32 marks must be tested before the UTF-16
    # marks they begin with
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    payload, encoding = first_bytes, 'utf-8'
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], bom_encoding
            break
    text = payload.decode(encoding, 'replace')

    # HTML is assumed when the first non-blank character opens a tag
    return re.match(r'^\s*<', text)
a055469f
PH
2320
2321
def determine_protocol(info_dict):
    """Return the download protocol for a format/info dict.

    An explicit 'protocol' entry wins.  Otherwise the URL prefix selects
    rtmp/mms/rtsp, the URL extension selects m3u8/f4m, and as a last
    resort the URL scheme is returned.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2342
2343
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of every column, measured on the text form of the values
    max_lens = []
    for column in zip(*table):
        max_lens.append(max(len(compat_str(cell)) for cell in column))
    # All columns but the last are left-aligned and padded by one space
    padded = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(padded) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2350
2351
2352def _match_one(filter_part, dct):
2353 COMPARISON_OPERATORS = {
2354 '<': operator.lt,
2355 '<=': operator.le,
2356 '>': operator.gt,
2357 '>=': operator.ge,
2358 '=': operator.eq,
2359 '!=': operator.ne,
2360 }
2361 operator_rex = re.compile(r'''(?x)\s*
2362 (?P<key>[a-z_]+)
2363 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2364 (?:
2365 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2366 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2367 )
2368 \s*$
2369 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2370 m = operator_rex.search(filter_part)
2371 if m:
2372 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc
S
2373 actual_value = dct.get(m.group('key'))
2374 if (m.group('strval') is not None or
2375 # If the original field is a string and matching comparisonvalue is
2376 # a number we should respect the origin of the original field
2377 # and process comparison value as a string (see
2378 # https://github.com/rg3/youtube-dl/issues/11082).
2379 actual_value is not None and m.group('intval') is not None and
2380 isinstance(actual_value, compat_str)):
347de493
PH
2381 if m.group('op') not in ('=', '!='):
2382 raise ValueError(
2383 'Operator %s does not support string values!' % m.group('op'))
e5a088dc 2384 comparison_value = m.group('strval') or m.group('intval')
347de493
PH
2385 else:
2386 try:
2387 comparison_value = int(m.group('intval'))
2388 except ValueError:
2389 comparison_value = parse_filesize(m.group('intval'))
2390 if comparison_value is None:
2391 comparison_value = parse_filesize(m.group('intval') + 'B')
2392 if comparison_value is None:
2393 raise ValueError(
2394 'Invalid integer value %r in filter part %r' % (
2395 m.group('intval'), filter_part))
347de493
PH
2396 if actual_value is None:
2397 return m.group('none_inclusive')
2398 return op(actual_value, comparison_value)
2399
2400 UNARY_OPERATORS = {
2401 '': lambda v: v is not None,
2402 '!': lambda v: v is None,
2403 }
2404 operator_rex = re.compile(r'''(?x)\s*
2405 (?P<op>%s)\s*(?P<key>[a-z_]+)
2406 \s*$
2407 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2408 m = operator_rex.search(filter_part)
2409 if m:
2410 op = UNARY_OPERATORS[m.group('op')]
2411 actual_value = dct.get(m.group('key'))
2412 return op(actual_value)
2413
2414 raise ValueError('Invalid filter part %r' % filter_part)
2415
2416
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are joined by '&' and evaluated left to right; the first
    # failing part decides
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2422
2423
def match_filter_func(filter_str):
    """Build a match filter callable from the filter expression filter_str.

    The returned function takes an info dict and returns None when it
    passes the filter, otherwise a message explaining why it is skipped.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2432
2433
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Accepts an offset such as "12.5" or "12.5s" and clock times of the
    form "H:MM:SS", "H:MM:SS.fff" or "H:MM:SS:fff".  Returns None for
    empty input or an unsupported format.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' may separate the fraction; treat it like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2445
2446
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode ``HH:MM:SS,mmm``."""
    # Work on whole milliseconds.  Truncating the float parts separately
    # lost a millisecond for values such as 1.001, whose fractional part
    # times 1000 comes out as 0.9999999999998899 (-> ",000").
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2449
2450
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT.

    Every <p> element with a usable begin time and either an end time or
    a duration becomes one SRT cue; <br> elements become newlines.
    Raises ValueError if the document contains no <p> elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of one paragraph
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and replay it through the text collector
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try the known namespaces in turn, then un-namespaced documents
    paras = (
        dfxp.findall(_x('.//ttml:p')) or
        dfxp.findall(_x('.//ttaf1:p')) or
        dfxp.findall(_x('.//ttaf1_0604:p')) or
        dfxp.findall('.//p'))

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Cue numbers follow the paragraph position, so skipped paragraphs
    # leave gaps in the numbering
    for index, para in enumerate(paras, 1):
        attrs = para.attrib
        begin_time = parse_dfxp_time_expr(attrs.get('begin'))
        end_time = parse_dfxp_time_expr(attrs.get('end'))
        dur = parse_dfxp_time_expr(attrs.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2504
2505
66e289ba
S
def cli_option(params, command_option, param):
    """Build ``[command_option, value]`` from params[param].

    Returns an empty list when the parameter is missing or None.  Truthy
    values are converted to text; falsy non-None values are passed as is.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2511
2512
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for the boolean params[param].

    The flag is rendered as true_value or false_value, either as a
    separate argument or, when separator is given, joined to
    command_option with it.  The parameter must be a bool.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2519
2520
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] iff params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2524
2525
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored in params[param].

    default is returned when the parameter is missing or None; any other
    value must be a list.
    """
    extra_args = params.get(param)
    if extra_args is not None:
        assert isinstance(extra_args, list)
        return extra_args
    return default
2532
2533
39672624
YCH
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant ('en-US' -> 'en')
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by scanning the map; None if the code is unknown
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2734
2735
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (any case) to the country's full name.

        Returns None for codes that are not in the table.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2994
2995
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request override the proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use for that request only; the value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            # Defaults bind the current scheme and bound method to each lambda
            handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            # The header is internal and must not be sent out
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
3019
3020
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one big
    # hexadecimal number, i.e. data[0] ends up as the least significant byte.
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '{0:x}'.format(pow(payload, exponent, modulus))
81bdc8fd
YCH
3036
3037
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` as a string in base `n`.

    table: optional sequence of digit symbols; table[i] is the symbol for
        digit value i. Defaults to the first `n` characters of
        0-9, a-z, A-Z (so bases up to 62 are supported without a table).

    Returns the encoded string, most significant digit first.

    Raises ValueError if `n` exceeds the number of available symbols, if
    `num` is negative, or if `n` is less than 2 for a non-zero `num`.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Floor division never brings a negative number to zero (-1 // n == -1)
    # and a base below 2 never shrinks num, so without these checks the
    # loop below would not terminate.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    digits = []  # least significant digit first
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
3054
3055
def decode_packed_codes(code):
    """Expand packed code by substituting its symbol table back in.

    PACKED_CODES_RE is searched in `code` and must yield four groups: the
    obfuscated code, the numeric base, the symbol count and a '|'-separated
    symbol list. Every word in the obfuscated code is then replaced by the
    symbol whose index, written in that base, equals the word; an empty
    symbol means the word stands for itself.

    NOTE(review): presumably the input is the output of a JavaScript
    "packer" -- the exact format is whatever PACKED_CODES_RE matches.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each index (spelled in base `base`) to its replacement symbol.
    symbol_table = {}
    for index in reversed(range(count)):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    def _substitute(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _substitute, obfuscated_code)
e154c651 3072
3073
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=VALUE attribute list into a dict.

    Keys consist of upper-case letters, digits and '-'. A value is either a
    double-quoted string (which may contain commas; the quotes are stripped
    from the result) or a run of characters containing neither '"' nor ','.
    When a key occurs more than once the last occurrence wins.
    """
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    info = {}
    for match in re.finditer(attribute_re, attrib):
        name, raw_value = match.group('key', 'val')
        info[name] = raw_value[1:-1] if raw_value.startswith('"') else raw_value
    return info
1143535d
YCH
3081
3082
def urshift(val, n):
    """Shift `val` right by `n` bits without sign extension.

    A negative `val` is first offset by 2**32 (0x100000000), i.e. it is
    read as an unsigned 32-bit quantity, and then shifted.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
3085
3086
3087# Based on png2str() written by @gdkchan and improved by @yokrysty
3088# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into rows of unfiltered channel values.

    The scanline stride is taken to be width * 3 bytes, i.e. the image data
    is read as three one-byte channels per pixel. The remaining IHDR fields
    are not inspected and chunk CRCs are skipped without being verified.

    Returns a (width, height, pixels) tuple where `pixels` is a list of
    `height` rows, each a flat list of width * 3 integers in 0..255.

    Raises IOError if the PNG signature or the leading IHDR chunk is
    missing, or if the file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    formats_by_size = {1: '>B', 2: '>H', 4: '>I'}

    def _read_uint(raw):
        # Big-endian unsigned integer whose width is given by len(raw)
        return compat_struct_unpack(formats_by_size[len(raw)], raw)[0]

    def _paeth(a, b, c):
        # Pick whichever of left (a), up (b), upper-left (c) is closest to
        # a + b - c, preferring a, then b, on ties.
        estimate = a + b - c
        dist_a = abs(estimate - a)
        dist_b = abs(estimate - b)
        dist_c = abs(estimate - c)
        if dist_a <= dist_b and dist_a <= dist_c:
            return a
        if dist_b <= dist_c:
            return b
        return c

    # Walk the chunk sequence: 4-byte length, 4-byte type, data, 4-byte CRC.
    ihdr = None
    idat_parts = []
    offset = 8  # just past the signature
    total_size = len(png_data)
    while offset < total_size:
        length = _read_uint(png_data[offset:offset + 4])
        chunk_type = png_data[offset + 4:offset + 8]
        chunk_data = png_data[offset + 8:offset + 8 + length]
        offset += 8 + length + 4  # header + data + CRC (skipped)

        if ihdr is None:
            # The first chunk was checked above to be IHDR
            ihdr = chunk_data
        if chunk_type == b'IDAT':
            idat_parts.append(chunk_data)

    width = _read_uint(ihdr[:4])
    height = _read_uint(ihdr[4:8])

    idat = b''.join(idat_parts)
    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []
    previous_row = None

    for y in range(height):
        # Each scanline is one filter-type byte followed by `stride` bytes
        row_start = y * (1 + stride)
        filter_type = scanlines[row_start]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = scanlines[row_start + 1 + x]
            # Neighbours are the same channel of the pixel to the left
            # (3 bytes back) and of the pixel directly above.
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = previous_row[x - 3] if x > 2 and y > 0 else 0
                predictor = _paeth(left, up, upper_left)
            else:  # no filter (or unrecognised type): value is kept as is
                predictor = 0

            current_row.append((color + predictor) & 0xff)

        previous_row = current_row

    return width, height, pixels
efa97bdc
YCH
3192
3193
def write_xattr(path, key, value):
    """Set the extended attribute `key` to `value` on the file at `path`.

    The first usable backend is picked in this order:
      1. the Python `xattr` module (either pyxattr or xattr);
      2. on Windows (compat_os_name == 'nt'), an NTFS Alternate Data Stream;
      3. the `setfattr` or `xattr` command-line tool.

    `value` is written as-is by the module and ADS backends and is decoded
    as UTF-8 before being passed to a command-line tool.

    Raises XAttrUnavailableError when no backend can be used (including a
    pyxattr older than 0.5.0) and XAttrMetadataError when the chosen backend
    fails to write the attribute.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            installed_version = xattr.__version__
            if version_tuple(installed_version) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, installed_version))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = [encodeFilename(executable, True)]
        cmd += [encodeArgument(o) for o in opts]
        cmd.append(encodeFilename(path, True))

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)