]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[utils] Handle single-line comments in js_to_json
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
efa97bdc 45 compat_os_name,
8c25f81b 46 compat_parse_qs,
702ccf2d 47 compat_shlex_quote,
be4a824d 48 compat_socket_create_connection,
8c25f81b 49 compat_str,
edaa23f8 50 compat_struct_pack,
d3f8e038 51 compat_struct_unpack,
8c25f81b
PH
52 compat_urllib_error,
53 compat_urllib_parse,
15707c7e 54 compat_urllib_parse_urlencode,
8c25f81b 55 compat_urllib_parse_urlparse,
7581bfc9 56 compat_urllib_parse_unquote_plus,
8c25f81b
PH
57 compat_urllib_request,
58 compat_urlparse,
810c10ba 59 compat_xpath,
8c25f81b 60)
4644ac55 61
71aff188
YCH
62from .socks import (
63 ProxyType,
64 sockssocket,
65)
66
4644ac55 67
51fb4995
YCH
def register_socks_protocols():
    """Make URL parsing treat the SOCKS schemes as carrying a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
75
76
468e2e92
FV
77# This is not clearly defined otherwise
78compiled_regex_type = type(re.compile(''))
79
3e669f36 80std_headers = {
15d10678 81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 86}
f427df17 87
5f6a1245 88
fb37eb25
S
89USER_AGENTS = {
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
91}
92
93
bf42a990
S
94NO_DEFAULT = object()
95
7105440c
YCH
96ENGLISH_MONTH_NAMES = [
97 'January', 'February', 'March', 'April', 'May', 'June',
98 'July', 'August', 'September', 'October', 'November', 'December']
99
f6717dec
S
100MONTH_NAMES = {
101 'en': ENGLISH_MONTH_NAMES,
102 'fr': [
3e4185c3
S
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 105}
a942d6cb 106
a7aaa398
S
107KNOWN_EXTENSIONS = (
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
112 'avi', 'divx',
113 'mov',
114 'asf', 'wmv', 'wma',
115 '3gp', '3g2',
116 'mp3',
117 'flac',
118 'ape',
119 'wav',
120 'f4f', 'f4m', 'm3u8', 'smil')
121
c587cbb7 122# needed for sanitizing filenames in restricted mode
c8827027 123ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
124 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
125 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 126
46f59e89
S
127DATE_FORMATS = (
128 '%d %B %Y',
129 '%d %b %Y',
130 '%B %d %Y',
cb655f34
S
131 '%B %dst %Y',
132 '%B %dnd %Y',
133 '%B %dth %Y',
46f59e89 134 '%b %d %Y',
cb655f34
S
135 '%b %dst %Y',
136 '%b %dnd %Y',
137 '%b %dth %Y',
46f59e89
S
138 '%b %dst %Y %I:%M',
139 '%b %dnd %Y %I:%M',
140 '%b %dth %Y %I:%M',
141 '%Y %m %d',
142 '%Y-%m-%d',
143 '%Y/%m/%d',
81c13222 144 '%Y/%m/%d %H:%M',
46f59e89 145 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 146 '%Y-%m-%d %H:%M',
46f59e89
S
147 '%Y-%m-%d %H:%M:%S',
148 '%Y-%m-%d %H:%M:%S.%f',
149 '%d.%m.%Y %H:%M',
150 '%d.%m.%Y %H.%M',
151 '%Y-%m-%dT%H:%M:%SZ',
152 '%Y-%m-%dT%H:%M:%S.%fZ',
153 '%Y-%m-%dT%H:%M:%S.%f0Z',
154 '%Y-%m-%dT%H:%M:%S',
155 '%Y-%m-%dT%H:%M:%S.%f',
156 '%Y-%m-%dT%H:%M',
c6eed6b8
S
157 '%b %d %Y at %H:%M',
158 '%b %d %Y at %H:%M:%S',
46f59e89
S
159)
160
161DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
162DATE_FORMATS_DAY_FIRST.extend([
163 '%d-%m-%Y',
164 '%d.%m.%Y',
165 '%d.%m.%y',
166 '%d/%m/%Y',
167 '%d/%m/%y',
168 '%d/%m/%Y %H:%M:%S',
169])
170
171DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
172DATE_FORMATS_MONTH_FIRST.extend([
173 '%m-%d-%Y',
174 '%m.%d.%Y',
175 '%m/%d/%Y',
176 '%m/%d/%y',
177 '%m/%d/%Y %H:%M:%S',
178])
179
06b3fe29
S
180PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
181
7105440c 182
def preferredencoding():
    """Return the encoding to use on this system.

    Starts from locale.getpreferredencoding() and falls back to 'UTF-8'
    whenever that lookup fails or yields an encoding that cannot be used.
    """
    try:
        encoding_name = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding name raises here.
        'TEST'.encode(encoding_name)
    except Exception:
        return 'UTF-8'
    return encoding_name
d77c3dfd 196
f4bfd65f 197
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first dumped to a temporary file created next to fn and that
    file is then renamed over fn. If anything fails, the temporary file is
    removed (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use a unicode object
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = os.path.basename(fn).decode(fs_encoding) + '.'
        tmp_dir = os.path.dirname(fn).decode(fs_encoding)
    else:
        tmp_prefix = os.path.basename(fn) + '.'
        tmp_dir = os.path.dirname(fn)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix,
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; cleanup is best effort.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
250
251
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        With val=None any element matching xpath that carries the attribute
        key is accepted. Returns the first match or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] by filtering the matches manually """
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (
                    val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
266
d7e66d39
JMF
267# On python2.6 the xml.etree.ElementTree.Element methods don't support
268# the namespace parameter
5f6a1245
JW
269
270
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{namespace}tag'.

    ns_map maps prefixes to namespace URIs; steps without a colon are kept
    unchanged. An unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
281
d77c3dfd 282
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matching xpath.

    xpath is either a single XPath string or an iterable of XPath strings
    that are tried in order until one matches.

    When nothing matches: default is returned if one was given, otherwise
    ExtractorError is raised if fatal is set (name, or xpath when name is
    None, is used in the message), otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so an empty iterable of xpaths is handled
        # like a miss instead of leaving n unbound.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
304
305
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element(). An element without text
    yields default if given, raises ExtractorError if fatal is set, and
    returns None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % label)
    return None
a41fb80c
S
319
320
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if given, otherwise
    ExtractorError is raised if fatal is set, otherwise None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
bf0ff932
PH
332
333
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
43e8fafd 337
12ea2f30 338
84c237fb
YCH
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name"""
    # The class may be one of several in the attribute value, so match it as
    # a whole word anywhere inside the (unescaped) value pattern.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_pattern, html, escape_value=False)
343
344
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document

    value is matched literally unless escape_value is False, in which case it
    is used as a regular expression fragment. Returns None when no tag
    matches; otherwise the HTML-unescaped inner content of the first match.
    """

    if escape_value:
        value = re.escape(value)

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    match = re.search(pattern, html)
    if match is None:
        return None

    content = match.group('content')
    # Content that opens with a quote character loses its first and last character.
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 368
c5229f39 369
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Filled by handle_starttag(); stays empty if no start tag is seen.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Store the (name, value) pairs of the start tag as a dict."""
        self.attrs = dict(attrs)
378
c5229f39 379
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    html_element is the string of one start tag, such as
        <el a="foo" B="bar" d=boz empty= noval entity="&amp;" sq='"' dq="'">
    which is decoded to
        {'a': 'foo', 'b': 'bar', 'd': 'boz', 'empty': '', 'noval': None,
         'entity': '&', 'sq': '"', 'dq': '\''}.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attribute_parser = HTMLAttributeParser()
    attribute_parser.feed(html_element)
    attribute_parser.close()
    return attribute_parser.attrs
9e6dd238 400
c5229f39 401
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph breaks do
    html = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
417
418
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the name passed through sanitize_path(); if that changes nothing or the
    error was a permission error, the original exception is re-raised, like
    the standard open() function would do.

    The filename '-' selects standard output (its binary buffer if present).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
449
450
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 458
5f6a1245 459
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: keep 12:34:56 recognizable as 12_34_56
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'__+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 498
5f6a1245 499
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned untouched. On Windows the path is
    normalized and, in each component other than '.' and '..', characters
    that are not allowed in file names (and a trailing space or dot) are
    replaced with '#'. A drive or UNC prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    components = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        components.pop(0)

    def clean(component):
        if component in ['.', '..']:
            return component
        return re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', component)

    sanitized_path = [clean(component) for component in components]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
516
517
67dda517
S
518# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
519# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Give protocol-relative URLs ('//host/path') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
522
523
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request for url after passing it through sanitize_url()."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
67dda517
S
526
527
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in input
    order. Elements are compared with ==, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 535
912b38b4 536
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without the leading '&' but with the
    trailing ';' (e.g. 'amp;' or '#x2F;'). Entities that cannot be decoded
    are returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Code points outside the valid range raise ValueError; numbers too
        # large for a C int raise OverflowError instead. Both mean "not a
        # character", so fall through to the literal representation.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
566
567
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for.

    None is passed through unchanged; any other input must be a text string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', decode_entity, s)
d77c3dfd 575
8bf48f23 576
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for file names and subprocess arguments."""
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
587
588
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file system (or subprocess) APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is used as a subprocess argument
    """

    assert type(s) == compat_str

    keep_unicode = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and
         sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as
        # Python 2.x compatible
        sys.platform.startswith('java'))
    if keep_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
611
612
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name to text on Python 2.

    On Python 3, and for values that are not byte strings, b is returned
    unchanged. Undecodable bytes are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 622
f07b74fc
PH
623
def encodeArgument(s):
    """Encode a command line argument for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
631
632
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
635
636
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding byte strings with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 645
5f6a1245 646
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that can hold the value is used: hours appear from
    3600 seconds on and minutes from 60 seconds on, so exactly one hour is
    '1:00:00' and exactly one minute is '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
654
a0ddb8a2 655
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is disabled when params['nocheckcertificate']
    is set. Extra keyword arguments are forwarded to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 679
732ea2f0 680
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages that asks for a bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
690
691
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Network errors and unavailable videos are never youtube-dl bugs
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 719
1c256f70 720
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
726
727
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
731
732
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
745
746
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
754
755
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error description; it is also available as self.msg """
        # Initialize the base class so str(exc) and exc.args carry the message
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 765
5f6a1245 766
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
770
771
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
779
780
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the received and announced sizes, both in bytes """
        # Give the exception a readable message instead of an empty string
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 793
5f6a1245 794
efa97bdc
YCH
class XAttrMetadataError(Exception):
    """Error while writing extended attribute metadata.

    code is an errno value (or None) and msg the error text. Both are kept
    as attributes and classified into self.reason, which is one of
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # The message checks cover callers that only have the error text.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
809
810
class XAttrUnavailableError(Exception):
    """Raised when extended attributes cannot be used."""
813
814
c5a59d93 815def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
816 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
817 # expected HTTP responses to meet HTTP/1.0 or later (see also
818 # https://github.com/rg3/youtube-dl/issues/6727)
819 if sys.version_info < (3, 0):
5a1a2e94 820 kwargs[b'strict'] = True
be4a824d
PH
821 hc = http_class(*args, **kwargs)
822 source_address = ydl_handler._params.get('source_address')
823 if source_address is not None:
824 sa = (source_address, 0)
825 if hasattr(hc, 'source_address'): # Python 2.7+
826 hc.source_address = sa
827 else: # Python 2.6
828 def _hc_connect(self, *args, **kwargs):
829 sock = compat_socket_create_connection(
830 (self.host, self.port), self.timeout, sa)
831 if is_https:
d7932313
PH
832 self.sock = ssl.wrap_socket(
833 sock, self.key_file, self.cert_file,
834 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
835 else:
836 self.sock = sock
837 hc.connect = functools.partial(_hc_connect, hc)
838
839 return hc
840
841
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a new dict is returned without the marker
    and without any Accept-Encoding header (matched case-insensitively).
    Otherwise headers itself is returned. The input is never modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    return dict(
        (name, value) for name, value in headers.items()
        if name != 'Youtubedl-no-compression' and name.lower() != 'accept-encoding')
87f0e62d
YCH
850
851
acebc9cd 852class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
853 """Handler for HTTP requests and responses.
854
855 This class, when installed with an OpenerDirector, automatically adds
856 the standard headers to every HTTP request and handles gzipped and
857 deflated responses from web servers. If compression is to be avoided in
858 a particular request, the original request in the program code only has
0424ec30 859 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
860 removed before making the real request.
861
862 Part of this code was copied from:
863
864 http://techknack.net/python-urllib2-handlers/
865
866 Andrew Rowls, the author of that code, agreed to release it to the
867 public domain.
868 """
869
be4a824d
PH
870 def __init__(self, params, *args, **kwargs):
871 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
872 self._params = params
873
874 def http_open(self, req):
71aff188
YCH
875 conn_class = compat_http_client.HTTPConnection
876
877 socks_proxy = req.headers.get('Ytdl-socks-proxy')
878 if socks_proxy:
879 conn_class = make_socks_conn_class(conn_class, socks_proxy)
880 del req.headers['Ytdl-socks-proxy']
881
be4a824d 882 return self.do_open(functools.partial(
71aff188 883 _create_http_connection, self, conn_class, False),
be4a824d
PH
884 req)
885
59ae15a5
PH
886 @staticmethod
887 def deflate(data):
888 try:
889 return zlib.decompress(data, -zlib.MAX_WBITS)
890 except zlib.error:
891 return zlib.decompress(data)
892
893 @staticmethod
894 def addinfourl_wrapper(stream, headers, url, code):
895 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
896 return compat_urllib_request.addinfourl(stream, headers, url, code)
897 ret = compat_urllib_request.addinfourl(stream, headers, url)
898 ret.code = code
899 return ret
900
acebc9cd 901 def http_request(self, req):
51f267d9
S
902 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
903 # always respected by websites, some tend to give out URLs with non percent-encoded
904 # non-ASCII characters (see telemb.py, ard.py [#3412])
905 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
906 # To work around aforementioned issue we will replace request's original URL with
907 # percent-encoded one
908 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
909 # the code of this workaround has been moved here from YoutubeDL.urlopen()
910 url = req.get_full_url()
911 url_escaped = escape_url(url)
912
913 # Substitute URL if any change after escaping
914 if url != url_escaped:
15d260eb 915 req = update_Request(req, url=url_escaped)
51f267d9 916
33ac271b 917 for h, v in std_headers.items():
3d5f7a39
JK
918 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
919 # The dict keys are capitalized because of this bug by urllib
920 if h.capitalize() not in req.headers:
33ac271b 921 req.add_header(h, v)
87f0e62d
YCH
922
923 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
924
925 if sys.version_info < (2, 7) and '#' in req.get_full_url():
926 # Python 2.6 is brain-dead when it comes to fragments
927 req._Request__original = req._Request__original.partition('#')[0]
928 req._Request__r_type = req._Request__r_type.partition('#')[0]
929
59ae15a5
PH
930 return req
931
def http_response(self, req, resp):
    """Post-process a response: decompress the body and escape redirect URLs.

    A 'gzip' or 'deflate' Content-encoding is decoded and the response is
    replaced by a new one wrapping the decoded bytes. For 3xx responses the
    Location header is percent-encoded when escaping changes it.
    Returns the (possibly replaced) response object.
    """
    old_resp = resp
    # gzip
    if resp.headers.get('Content-encoding', '') == 'gzip':
        content = resp.read()
        gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
        try:
            uncompressed = io.BytesIO(gz.read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with 1..1023 trailing bytes chopped off; the first
            # truncation that decompresses wins.
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                except IOError:
                    continue
                break
            else:
                # No truncation helped: report the very first failure
                raise original_ioerror
        resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        # The body is decoded now, so the header no longer applies
        del resp.headers['Content-encoding']
    # deflate
    if resp.headers.get('Content-encoding', '') == 'deflate':
        gz = io.BytesIO(self.deflate(resp.read()))
        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        del resp.headers['Content-encoding']
    # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
    # https://github.com/rg3/youtube-dl/issues/6457).
    if 300 <= resp.code < 400:
        location = resp.headers.get('Location')
        if location:
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
            else:
                location = location.decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                # Python 2 header values are byte strings
                if sys.version_info < (3, 0):
                    location_escaped = location_escaped.encode('utf-8')
                resp.headers['Location'] = location_escaped
    return resp
0f8d03f8 978
acebc9cd
PH
# HTTPS traffic goes through exactly the same request/response processing
https_request = http_request
https_response = http_response
bf50b038 981
5de90176 982
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of base_class that connects through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection class (asserted).
    socks_proxy is the proxy URL; its scheme selects the SOCKS version and its
    host, port (default 1080) and optional credentials configure the proxy.
    NOTE(review): a scheme other than socks/socks4/socks4a/socks5 leaves
    socks_type unassigned and fails when proxy_args is built.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Leave None/'' untouched so missing credentials stay missing
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    # Positional arguments handed to sockssocket.setproxy() on every connect
    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS the proxied socket is wrapped in TLS after connecting
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1024
1025
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with a pluggable connection class and SOCKS proxy support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        """Open req, routing it through a SOCKS proxy when the request asks for one."""
        # Forward whichever TLS settings this Python version's handler exposes
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        open_kwargs = {}
        for attr, key in (('_context', 'context'), ('_check_hostname', 'check_hostname')):
            if hasattr(self, attr):
                open_kwargs[key] = getattr(self, attr)

        connection_class = self._https_conn_class
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_class = make_socks_conn_class(connection_class, proxy_url)
            # Internal header: must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, connection_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
1049
1050
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS.

    The Set-Cookie escaping workaround below is currently disabled, so
    http_response simply delegates to the base class.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS requests and responses get the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1073
1074
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC designator or offset off a date string.

    Returns (offset, remainder): offset is a datetime.timedelta (zero for 'Z'
    or when no timezone is recognized) and remainder is date_str without the
    timezone suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    offset = datetime.timedelta()
    if not tz_match:
        return offset, date_str

    date_str = date_str[:-len(tz_match.group('tz'))]
    if tz_match.group('sign'):
        direction = 1 if tz_match.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(tz_match.group('hours')),
            minutes=direction * int(tz_match.group('minutes')))
    return offset, date_str
1091
1092
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are dropped. When timezone is None the offset is taken
    from the string itself; otherwise timezone is subtracted as given.
    Returns None for None input or when the string does not parse.
    """
    if date_str is None:
        return None

    without_fraction = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_moment = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
912b38b4
PH
1110
1111
46f59e89
S
def date_formats(day_first=True):
    """Return the day-first or the month-first list of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1114
1115
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the e-mail (RFC 2822 style) date parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1142
5f6a1245 1143
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    # Any 'PM' in the string shifts the result by twelve hours
    if re.search(r'(?i)PM', text):
        pm_delta = 12
    else:
        pm_delta = 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    # Fall back to the e-mail (RFC 2822 style) date parser
    parsed = email.utils.parsedate_tz(text)
    if parsed:
        return calendar.timegm(parsed) + pm_delta * 3600
    return None
46f59e89
S
1165
1166
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, or return default_ext.

    The candidate is whatever follows the last dot before any '?'.
    """
    if url is None:
        return default_ext

    path_part = url.partition('?')[0]
    candidate = path_part.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1178
5f6a1245 1179
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 1182
5f6a1245 1183
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    if date_str in ('now', 'today'):
        return today

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    # A bad approximation? Months count as 30 days and years as 365 days
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    return today + datetime.timedelta(days=amount * days_per_unit[relative.group('unit')])
5f6a1245
JW
1211
1212
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1221
5f6a1245 1222
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1252
1253
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
c257baff
PH
1262
1263
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Only streams whose file descriptor is 1 or 2 and whose handle is a real
    console are written here (via WriteConsoleW); everything else is left to
    the caller. Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE and -12 is STD_ERROR_HANDLE in the Windows API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character device for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any non-BMP
        # character; count == 0 means s starts with a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # A non-BMP character is written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
1337
1338
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default: sys.stderr) and flush it.

    Byte-oriented streams receive s encoded with encoding (or a fallback
    encoding), ignoring characters that cannot be encoded.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so always send bytes there
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1359
1360
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of bs as a list of integers ([] for empty input)."""
    if not bs:
        return []
    # Indexing already yields ints on Python 3; otherwise convert each item
    if not isinstance(bs[0], int):
        return [ord(item) for item in bs]
    return list(bs)
1368
c257baff 1369
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte each."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1374
1375
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, chosen to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock, 0x0 a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1449
1450
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; mode 'r' takes a shared lock, 'a' and
    'w' take an exclusive one. The file is closed when the block is left, or
    when acquiring the lock fails with IOError.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1480
1481
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when it is unknown."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1485
1486
def shell_quote(args):
    """Return the arguments joined into a single shell-escaped string.

    args is an iterable of text or byte strings; byte strings are decoded with
    the filesystem encoding before quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Quote through the compat helper this file already imports instead of
        # pipes.quote: that function was never documented and the pipes module
        # no longer exists in Python 3.13.
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1496
1497
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into the URL fragment. If url already carries
    smuggled data, that data is merged in and takes precedence over data.
    The dict passed as data is left unmodified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is not changed as a side effect
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
1506
1507
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The marker guarantees at least one '#', so the split yields two parts
    url, fragment = smug_url.rsplit('#', 1)
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1515
1516
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The value is scaled by the largest power of 1024 not exceeding it and
    printed with two decimals, e.g. 1536 -> '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: without this, values of
        # 1024 ** 9 and above raise IndexError and tiny fractions pick a
        # suffix from the end of the list through a negative index.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1529
1c088fa8 1530
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using unit_table.

    unit_table maps unit names to multipliers. The number may use ',' or '.'
    as the decimal separator. Returns int(number * multiplier), or None when
    s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives
    found = re.match(pattern, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1540
1541
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a byte count.

    Returns None when s is None or has no recognized '<number><unit>' prefix.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (prefix letter, decimal unit name stem, binary unit name stem),
    # in order of increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_stem, binary_stem) in enumerate(prefixes, 1):
        decimal = 1000 ** power
        binary = 1024 ** power
        lower = letter.lower()
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_stem + 'bytes'] = decimal
        unit_table[binary_stem + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
1611
1612
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an integer.

    Returns None when s is None or is not recognized.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain digits with thousands separators need no unit handling
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand = 1000
    million = 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
be64b5b0 1632
2f7ae819 1633
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language
    (falling back to the English names for unknown languages),
    or None if the name is not found """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
1643
1644
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is not found """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        # The abbreviation is the first three letters of the month name
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1653
1654
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, except those that already
    start one of the predefined or numeric entities"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1661
1662
def setproctitle(title):
    """Set the name of the current process via libc's prctl(), best effort.

    title must be a compat_str. Nothing happens on Jython, when 'libc.so.6'
    cannot be loaded, or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than the
    # title, so it is NUL-terminated. A buffer sized exactly len(title_bytes)
    # has no terminator and prctl could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1682
1683
def remove_start(s, start):
    """Return s without the leading start, or s unchanged (None stays None)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1686
1687
def remove_end(s, end):
    """Return s without the trailing end, or s unchanged (None stays None)."""
    # An empty suffix must be a no-op: every string ends with '', and
    # s[:-0] is s[:0], which would wrongly return an empty string.
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
1690
1691
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching surrounding quotes (double or single) from s.

    None, strings shorter than two characters and strings without matching
    surrounding quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1699
1700
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1704
1705
02dc0a36
S
def base_url(url):
    """Return the http(s) URL up to and including the last '/' that precedes
    any '?', '#' or '&'."""
    prefix = re.match(r'https?://[^?#&]+/', url)
    return prefix.group()
1708
1709
e34c3361
S
def urljoin(base, path):
    """Join path onto base, returning None for unusable input.

    path must be a non-empty compat_str; if it is already an http(s) or
    protocol-relative URL it is returned as is. Otherwise base must be a
    compat_str holding such a URL.
    """
    absolute_url = r'^(?:https?:)?//'
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(absolute_url, path):
        return path
    if isinstance(base, compat_str) and re.match(absolute_url, base):
        return compat_urlparse.urljoin(base, path)
    return None
1718
1719
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1723
1724
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1728
1729
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). An empty string counts as None.
    The result is int(v) * invscale // scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: v is not number-like (e.g. a list or dict);
        # OverflowError: v is an infinite float
        return default
9732d77e 1742
9572013d 1743
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1746
9732d77e
PH
1747
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before converting; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1754
1755
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when that is not possible.

    The result is float(v) * invscale / scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is not number-like (e.g. a list or dict)
        return default
43f775e4
PH
1763
1764
b72b4431
S
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1767
1768
def parse_duration(s):
    """Parse a duration string and return its length in seconds.

    Three notations are tried in turn: colon-separated ('1:02:03.5'),
    unit-suffixed ('1h 2m 3s', optionally with a leading 'PT'), and a single
    possibly fractional amount of hours or minutes ('1.5 hours').
    Returns None when s is not a string or matches none of them; the result
    is a float whenever any component is present.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[days:]hours:]mins:]secs[.fraction]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        # Groups appear in the pattern in exactly this order
        days, hours, mins, secs, ms = m.groups()
    else:
        # Unit-suffixed form; every component is optional
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # A lone amount of hours or minutes, possibly fractional
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms holds the fractional part including its leading dot, e.g. '.5'
        duration += float(ms)
    return duration
91d7d0b3
JMF
1815
1816
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of filename's real extension.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1823
1824
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's real extension with ext.

    If expected_real_ext is given and the real extension differs from it,
    ext is appended to the whole filename instead.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
1830
1831
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (False if starting it fails with OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1840
1841
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1859
1860
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must capture the version in group 1 and defaults to matching
    'version <something>'. Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1870
1871
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        # (it materializes the complete list through getslice())
        return len(self.getslice())
1876
9c44d242
PH
1877
1878class OnDemandPagedList(PagedList):
b95dc034 1879 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1880 self._pagefunc = pagefunc
1881 self._pagesize = pagesize
b95dc034
YCH
1882 self._use_cache = use_cache
1883 if use_cache:
1884 self._cache = {}
9c44d242 1885
b7ab0590
PH
1886 def getslice(self, start=0, end=None):
1887 res = []
1888 for pagenum in itertools.count(start // self._pagesize):
1889 firstid = pagenum * self._pagesize
1890 nextfirstid = pagenum * self._pagesize + self._pagesize
1891 if start >= nextfirstid:
1892 continue
1893
b95dc034
YCH
1894 page_results = None
1895 if self._use_cache:
1896 page_results = self._cache.get(pagenum)
1897 if page_results is None:
1898 page_results = list(self._pagefunc(pagenum))
1899 if self._use_cache:
1900 self._cache[pagenum] = page_results
b7ab0590
PH
1901
1902 startv = (
1903 start % self._pagesize
1904 if firstid <= start < nextfirstid
1905 else 0)
1906
1907 endv = (
1908 ((end - 1) % self._pagesize) + 1
1909 if (end is not None and firstid <= end <= nextfirstid)
1910 else None)
1911
1912 if startv != 0 or endv is not None:
1913 page_results = page_results[startv:endv]
1914 res.extend(page_results)
1915
1916 # A little optimization - if current page is not "full", ie. does
1917 # not contain page_size videos then we can assume that this page
1918 # is the last one - there are no more ids on further pages -
1919 # i.e. no need to query again.
1920 if len(page_results) + startv < self._pagesize:
1921 break
1922
1923 # If we got the whole page, but the next page is not interesting,
1924 # break out early as well
1925 if end == nextfirstid:
1926 break
1927 return res
81c2f20b
PH
1928
1929
9c44d242
PH
1930class InAdvancePagedList(PagedList):
1931 def __init__(self, pagefunc, pagecount, pagesize):
1932 self._pagefunc = pagefunc
1933 self._pagecount = pagecount
1934 self._pagesize = pagesize
1935
1936 def getslice(self, start=0, end=None):
1937 res = []
1938 start_page = start // self._pagesize
1939 end_page = (
1940 self._pagecount if end is None else (end // self._pagesize + 1))
1941 skip_elems = start - start_page * self._pagesize
1942 only_more = None if end is None else end - start
1943 for pagenum in range(start_page, end_page):
1944 page = list(self._pagefunc(pagenum))
1945 if skip_elems:
1946 page = page[skip_elems:]
1947 skip_elems = None
1948 if only_more is not None:
1949 if len(page) < only_more:
1950 only_more -= len(page)
1951 else:
1952 page = page[:only_more]
1953 res.extend(page)
1954 break
1955 res.extend(page)
1956 return res
1957
1958
def uppercase_escape(s):
    """Replace every backslash-U escape (eight hex digits) in s by the
    character it denotes; everything else is left alone."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        # The decoder returns (text, consumed length); only the text matters
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1965
1966
def lowercase_escape(s):
    """Replace every backslash-u escape (four hex digits) in s by the
    character it denotes; everything else is left alone."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(mobj):
        # The decoder returns (text, consumed length); only the text matters
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1973
d05cfe06
S
1974
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if on_python2_text:
        # quote() on Python 2 works on byte strings
        s = s.encode('utf-8')
    # The second argument lists the characters that must stay unescaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1980
1981
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        # Host names are converted with the IDNA codec, not percent-escaped
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1992
62e609ab
PH
1993
def read_batch_urls(batch_fd):
    """Return the list of URLs found in the batch file object batch_fd.

    Every line is decoded as UTF-8 if it is not text already, freed from a
    leading byte order mark and stripped. Empty lines and lines starting
    with '#', ';' or ']' are skipped. batch_fd is closed afterwards.
    """
    # A UTF-8 BOM reads as U+FEFF once the bytes have been decoded as UTF-8,
    # or as the three characters \xef\xbb\xbf when the bytes were mapped to
    # characters one by one; neither is removed by strip()
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2008
2009
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2012
2013
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string.

    Keys already present in the URL are overwritten; an empty query returns
    url unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    # doseq=True: parsed values are lists and must be expanded again
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2022
8e60dc75 2023
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of the request req with some parts replaced.

    url and data replace the original values when given, headers are merged
    over the original headers and query is merged into the URL's query
    string. HEAD and PUT requests keep their method.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Pick the request class that reproduces the original HTTP method
    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = compat_urllib_request.Request

    new_req = request_cls(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is only present on requests that have it set already
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2042
2043
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in d.

    For a list or tuple of keys the first one whose value is not None (and,
    with skip_false_values, not falsy) wins; default is returned when there
    is none. A single key is a plain d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2052
2053
329ca3be
S
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed. With expected_type, a value of any other type yields None.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2062
2063
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding and errors
    when it is not a compat_str already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2066
16392824 2067
a1a530b0
PH
# Age limits that parse_age_limit() returns for US rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2075
2076
a8795327
S
# Age limits that parse_age_limit() returns for TV parental guideline labels
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2085
2086
def parse_age_limit(s):
    """Turn an age limit given as int, "NN"/"NN+" string or rating label
    into an int.

    Returns None for ints outside 0..21, for values that are neither int nor
    string and for strings that are not recognized.
    """
    # Exact type comparison: subclasses of int (such as bool) are not ages
    if type(s) == int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    # US ratings take precedence over TV parental guidelines
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2098
2099
def strip_jsonp(code):
    """Return the payload of a JSONP response, i.e. the text between the
    parentheses of "callback(...)"; an optional trailing semicolon and
    trailing // comments are dropped.

    Text that does not look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
2103
2104
e05f6939
PH
2105def js_to_json(code):
2106 def fix_kv(m):
e7b6d122
PH
2107 v = m.group(0)
2108 if v in ('true', 'false', 'null'):
2109 return v
b3ee552e 2110 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2111 return ""
2112
2113 if v[0] in ("'", '"'):
2114 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2115 '"': '\\"',
bd1e4844 2116 "\\'": "'",
2117 '\\\n': '',
2118 '\\x': '\\u00',
2119 }.get(m.group(0), m.group(0)), v[1:-1])
2120
89ac4a19 2121 INTEGER_TABLE = (
e4659b45
YCH
2122 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2123 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2124 )
2125
2126 for regex, base in INTEGER_TABLE:
2127 im = re.match(regex, v)
2128 if im:
e4659b45 2129 i = int(im.group(1), base)
89ac4a19
S
2130 return '"%d":' % i if v.endswith(':') else '%d' % i
2131
e7b6d122 2132 return '"%s"' % v
e05f6939 2133
bd1e4844 2134 return re.sub(r'''(?sx)
2135 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2136 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
b3ee552e 2137 /\*.*?\*/|//[^\n]*|,(?=\s*[\]}])|
bd1e4844 2138 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2139 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2140 [0-9]+(?=\s*:)
e05f6939 2141 ''', fix_kv, code)
e05f6939
PH
2142
2143
478c2c61
PH
def qualities(quality_ids):
    """ Build a function that ranks a quality id by its position in
    quality_ids; ids that are not listed rank -1 """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2152
acd69589
PH
2153
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2155
a020a0dc
PH
2156
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s itself when it is None or at most length characters long.
    Otherwise the result is cut so that, including the trailing ellipsis,
    it is never longer than length. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # Not even the ellipsis fits. A negative slice bound here would
        # yield a result longer than the limit, so cut the ellipsis instead
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
2165
2166
def version_tuple(v):
    """Split a version string at '.' and '-' and return its parts as a
    tuple of ints. Raises ValueError for non-numeric parts."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
2169
2170
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    A missing version, or one that can not be parsed, counts as new enough
    if assume_new is true and as outdated otherwise.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            # Unparsable version: fall through to the assumption below
            pass
    return not assume_new
732ea2f0
PH
2178
2179
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded from a zip archive ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or when sys carries the "frozen" attribute
    return hasattr(sys, 'frozen')
7d4111ed
PH
2185
2186
def args_to_str(args):
    """Get a short string representation for a subprocess command: the
    arguments shell-quoted and joined by spaces."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
2190
2191
def error_to_compat_str(err):
    """Return the message of the exception err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2199
2200
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Parameters after ';' are ignored and the comparison is case-insensitive.
    Subtypes without a known mapping are returned as they are; None yields
    None.
    """
    if mt is None:
        return None

    # Strip the parameters before anything else: they are irrelevant for the
    # mapping and may contain a '/' that must not be taken for the separator
    # between type and subtype
    mimetype = mt.split(';')[0].strip().lower()

    # Mappings that depend on the full type, not only on the subtype
    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mimetype)
    if ext is not None:
        return ext

    _, _, res = mimetype.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2237
2238
def parse_codecs(codecs_str):
    """Split a "codecs" parameter value into video and audio codec.

    Returns a dict with the keys 'vcodec' and 'acodec' ('none' marks a
    missing kind), or an empty dict if nothing can be told. When neither
    codec is known but exactly two are given, they are taken as video and
    audio codec in that order. A warning is written to stderr for every
    unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v')
    AUDIO_CODECS = ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3')

    stripped_codecs = [
        codec.strip() for codec in codecs_str.strip().strip(',').split(',')]
    splited_codecs = [codec for codec in stripped_codecs if codec]

    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The part before the first dot identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in VIDEO_CODECS:
            if not vcodec:
                vcodec = full_codec
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(splited_codecs) == 2:
        # Nothing recognized: trust the customary "video, audio" order
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    return {}
2273
2274
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response from its headers.

    The filename of a Content-Disposition attachment is tried first;
    otherwise the Content-Type decides.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2287
2288
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI holding the bytes data, labelled with
    mime_type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2291
2292
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2301
2302
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a (truthy) match object if the decoded text starts with "<",
    optionally preceded by whitespace, and None otherwise. """

    # Order matters: the UTF-32 marks must be tried before the UTF-16 marks
    # they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    text = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
2321
2322
def determine_protocol(info_dict):
    """Return the protocol of a format: its explicit 'protocol' entry if
    there is one, else a guess derived from its 'url'."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Prefix test, so that variants like rtmpe or mmsh are covered as well
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are named after their file extension
    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2343
2344
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column but the last is left-aligned and padded to the width of
    its longest cell plus one; rows are joined by newlines. """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # The last column needs no padding
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2351
2352
2353def _match_one(filter_part, dct):
2354 COMPARISON_OPERATORS = {
2355 '<': operator.lt,
2356 '<=': operator.le,
2357 '>': operator.gt,
2358 '>=': operator.ge,
2359 '=': operator.eq,
2360 '!=': operator.ne,
2361 }
2362 operator_rex = re.compile(r'''(?x)\s*
2363 (?P<key>[a-z_]+)
2364 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2365 (?:
2366 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2367 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2368 )
2369 \s*$
2370 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2371 m = operator_rex.search(filter_part)
2372 if m:
2373 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc
S
2374 actual_value = dct.get(m.group('key'))
2375 if (m.group('strval') is not None or
2376 # If the original field is a string and matching comparisonvalue is
2377 # a number we should respect the origin of the original field
2378 # and process comparison value as a string (see
2379 # https://github.com/rg3/youtube-dl/issues/11082).
2380 actual_value is not None and m.group('intval') is not None and
2381 isinstance(actual_value, compat_str)):
347de493
PH
2382 if m.group('op') not in ('=', '!='):
2383 raise ValueError(
2384 'Operator %s does not support string values!' % m.group('op'))
e5a088dc 2385 comparison_value = m.group('strval') or m.group('intval')
347de493
PH
2386 else:
2387 try:
2388 comparison_value = int(m.group('intval'))
2389 except ValueError:
2390 comparison_value = parse_filesize(m.group('intval'))
2391 if comparison_value is None:
2392 comparison_value = parse_filesize(m.group('intval') + 'B')
2393 if comparison_value is None:
2394 raise ValueError(
2395 'Invalid integer value %r in filter part %r' % (
2396 m.group('intval'), filter_part))
347de493
PH
2397 if actual_value is None:
2398 return m.group('none_inclusive')
2399 return op(actual_value, comparison_value)
2400
2401 UNARY_OPERATORS = {
2402 '': lambda v: v is not None,
2403 '!': lambda v: v is None,
2404 }
2405 operator_rex = re.compile(r'''(?x)\s*
2406 (?P<op>%s)\s*(?P<key>[a-z_]+)
2407 \s*$
2408 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2409 m = operator_rex.search(filter_part)
2410 if m:
2411 op = UNARY_OPERATORS[m.group('op')]
2412 actual_value = dct.get(m.group('key'))
2413 return op(actual_value)
2414
2415 raise ValueError('Invalid filter part %r' % filter_part)
2416
2417
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str holds one or more clauses separated by "&"; all of them
    have to match. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2423
2424
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None if the dict passes the filter and a
    message telling why the video is skipped otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, falling back to its id
        fallback_name = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_name)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2433
2434
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression into seconds (float).

    Understood are offsets in seconds ("12.5" or "12.5s") and clock times
    "H:MM:SS" with an optional fraction introduced by "." or ":". Returns
    None for empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # "SS:fff" is read as "SS.fff"
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2446
2447
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode "HH:MM:SS,mmm"."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d cuts off the fractional part of every component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2450
2451
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out. Raises ValueError if the document contains no
    paragraph at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target that collects the text of a paragraph and turns
        # <br> elements into line breaks
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the paragraph and run it through the text collector
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then paragraphs without namespace
    for namespaced_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(namespaced_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No end given: derive it from the duration, if there is one
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2505
2506
66e289ba
S
def cli_option(params, command_option, param):
    """Build the command line arguments for an option that takes a value.

    Returns [command_option, value] with the value of params[param]
    converted to a string, or an empty list if the parameter is missing or
    None.
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert every value, falsy ones like 0 included: a command line must
    # consist of strings only
    return [command_option, compat_str(value)]
2512
2513
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    params[param] must be a bool; it selects true_value or false_value.
    With separator, option and value are joined into a single argument,
    otherwise they are two arguments.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2520
2521
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value and
    an empty list otherwise."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2525
2526
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored in params[param], or
    default if the parameter is missing or None."""
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2533
2534
39672624
YCH
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters count, so "en-US" is looked up as "en"
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name in cls._lang_map:
            if cls._lang_map[short_name] == code:
                return short_name
        return None
2735
2736
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the
        corresponding full name; the case of code does not matter"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2995
2996
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets every request choose its own proxy.

    A request names its proxy in the 'Ytdl-request-proxy' header, which is
    removed before the request is passed on. The value '__noproxy__' stands
    for a direct connection.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            # Defaults bind the current scheme and the bound method at
            # definition time
            handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy requested by the request itself overrides the configured one
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
3020
3021
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The byte order is reversed before the data is read as one big integer
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    return '%x' % pow(plaintext, exponent, modulus)
81bdc8fd
YCH
3037
3038
def encode_base_n(num, n, table=None):
    """Encode the integer num as a string in base n.

    Input:
        num: integer to encode; negative values are encoded as '-' followed
            by the encoding of the absolute value
        n: the base; must not exceed the number of available digits
        table: optional string of digit characters; defaults to the first n
            characters of 0-9, a-z, A-Z
    Output: the base-n representation of num as a string

    Raises ValueError if n exceeds the table length, or if a non-zero num is
    requested in a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if n < 2:
        # Dividing by such a base never brings num down to zero (or divides
        # by zero), so the conversion loop below could not terminate.
        raise ValueError('base %d is too small' % n)

    sign = ''
    if num < 0:
        # Floor division of a negative number never reaches zero, so encode
        # the magnitude and prepend the sign instead.
        sign = '-'
        num = -num

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least-significant first
    return sign + ''.join(reversed(digits))
f52354a8
YCH
3055
3056
3057def decode_packed_codes(code):
06b3fe29 3058 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3059 obfucasted_code, base, count, symbols = mobj.groups()
3060 base = int(base)
3061 count = int(count)
3062 symbols = symbols.split('|')
3063 symbol_table = {}
3064
3065 while count:
3066 count -= 1
5eb6bdce 3067 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3068 symbol_table[base_n_count] = symbols[count] or base_n_count
3069
3070 return re.sub(
3071 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3072 obfucasted_code)
e154c651 3073
3074
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=VALUE attribute list into a dict.

    Keys consist of upper-case letters, digits and dashes. Values are either
    double-quoted (the quotes are stripped and commas inside are kept) or
    run up to the next comma. A repeated key keeps its last value.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (name, raw[1:-1] if raw.startswith('"') else raw)
        for name, raw in pairs)
1143535d
YCH
3082
3083
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit."""
    if val < 0:
        # Reinterpret the negative number as its 32-bit two's complement value
        val += 0x100000000
    return val >> n
d3f8e038
YCH
3086
3087
3088# Based on png2str() written by @gdkchan and improved by @yokrysty
3089# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG byte string into rows of unfiltered byte values.

    Input:
        png_data: the complete PNG file as a byte string
    Output: tuple (width, height, pixels), where pixels is a list with one
        list per scanline, each holding width * 3 integers

    Every scanline is read as width * 3 bytes; the bit depth, colour type
    and interlace fields of the IHDR chunk are not inspected and chunk CRCs
    are not verified.

    Raises IOError if the signature or the leading IHDR chunk is missing, or
    if the file has no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    remaining = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or remaining[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def read_uint(raw):
        # Big-endian unsigned integer stored in 1, 2 or 4 bytes
        return compat_struct_unpack(struct_formats[len(raw)], raw)[0]

    def paeth_predictor(a, b, c):
        # Pick whichever of left (a), up (b) and upper-left (c) is closest
        # to a + b - c, preferring them in that order on ties
        estimate = a + b - c
        dist_a = abs(estimate - a)
        dist_b = abs(estimate - b)
        dist_c = abs(estimate - c)
        if dist_a <= dist_b and dist_a <= dist_c:
            return a
        if dist_b <= dist_c:
            return b
        return c

    # Split the stream into chunks: 4-byte length, 4-byte type, data, CRC
    chunks = []
    while remaining:
        size = read_uint(remaining[:4])
        chunks.append({
            'type': remaining[4:8],
            'length': size,
            'data': remaining[8:8 + size],
        })
        # Skip length, type, data and the 4-byte CRC
        remaining = remaining[size + 12:]

    ihdr = chunks[0]['data']
    width = read_uint(ihdr[:4])
    height = read_uint(ihdr[4:8])

    idat = b''.join(
        chunk['data'] for chunk in chunks if chunk['type'] == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Each scanline is one filter-type byte followed by stride data bytes
        row_offset = y * (stride + 1)
        filter_type = scanlines[row_offset]

        above = pixels[y - 1] if y > 0 else None
        row = []
        pixels.append(row)

        for x in range(stride):
            sample = scanlines[row_offset + 1 + x]
            # Neighbours are the matching byte of the pixel to the left
            # (3 bytes back) and of the pixel above; 0 outside the image
            left = row[x - 3] if x > 2 else 0
            up = above[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = above[x - 3] if x > 2 and y > 0 else 0
                predictor = paeth_predictor(left, up, upper_left)
            else:  # No filter (or unknown type): the byte is kept as is
                predictor = 0

            row.append((sample + predictor) & 0xff)

    return width, height, pixels
efa97bdc
YCH
3193
3194
def write_xattr(path, key, value):
    """Set the extended attribute key to value on the file at path.

    The first available backend is used:
      1. the Python 'xattr' module (either pyxattr or xattr),
      2. on Windows, an NTFS Alternate Data Stream named after key,
      3. the 'setfattr' or 'xattr' command line tools.
    value is a byte string; it is decoded as UTF-8 before being passed to a
    command line tool.

    Raises XAttrUnavailableError if no usable backend exists (including a
    pyxattr older than 0.5.0) and XAttrMetadataError if writing fails.
    """
    try:
        # Prefer a Python module when one can be imported
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        # setfattr is preferred when both tools are present
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        else:
            executable = 'xattr'
            opts = ['-w', key, value]

        cmd = ([encodeFilename(executable, True)] +
               [encodeArgument(o) for o in opts] +
               [encodeFilename(path, True)])

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        _, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)