]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[naver] improve extraction(closes #8096)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
702ccf2d 46 compat_shlex_quote,
be4a824d 47 compat_socket_create_connection,
8c25f81b 48 compat_str,
edaa23f8 49 compat_struct_pack,
8c25f81b
PH
50 compat_urllib_error,
51 compat_urllib_parse,
15707c7e 52 compat_urllib_parse_urlencode,
8c25f81b 53 compat_urllib_parse_urlparse,
7581bfc9 54 compat_urllib_parse_unquote_plus,
8c25f81b
PH
55 compat_urllib_request,
56 compat_urlparse,
810c10ba 57 compat_xpath,
8c25f81b 58)
4644ac55 59
71aff188
YCH
60from .socks import (
61 ProxyType,
62 sockssocket,
63)
64
4644ac55 65
51fb4995
YCH
def register_socks_protocols():
    """Add the SOCKS URL schemes to the URL parser's ``uses_netloc`` list.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not in
    urlparse.uses_netloc are not handled correctly.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in netloc_schemes:
            continue
        netloc_schemes.append(socks_scheme)
73
74
468e2e92
FV
# The type of a compiled regular expression is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to a
# request that does not already carry it
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel used as a parameter default so that None can be passed explicitly
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil',
)

# needed for sanitizing filenames in restricted mode: maps each accented
# character to its ASCII replacement (some replacements are two characters)
ACCENT_CHARS = dict(zip(
    'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
    itertools.chain(
        'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 112
46f59e89
S
# strptime() formats that are unambiguous regardless of day/month order.
# The ordinal-suffix variants need one entry per suffix (st, nd, rd, th)
# because strptime has no directive for the suffix itself.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Common formats plus the ambiguous numeric ones, read as day before month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Common formats plus the ambiguous numeric ones, read as month before day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
155
7105440c 156
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when that call succeeds and the
    reported encoding can actually encode text; 'UTF-8' otherwise.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
d77c3dfd 170
f4bfd65f 171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a NamedTemporaryFile created next to fn
    (same directory, fn's basename as prefix, '.tmp' suffix) and then renamed
    onto fn. If anything fails, the temporary file is removed (best effort)
    and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before propagating
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] by filtering matches manually """
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
240
d7e66d39
JMF
241# On python2.6 the xml.etree.ElementTree.Element methods don't support
242# the namespace parameter
5f6a1245
JW
243
244
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{namespace}tag'.

    ns_map maps prefixes to namespace values; steps without a colon are
    kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
255
d77c3dfd 256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order. When nothing matches: default is returned if it was
    supplied, otherwise ExtractorError is raised if fatal is set, otherwise
    None is returned. name is only used in the error message.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of expressions is
    # handled like any other miss instead of leaving n unbound
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element or missing text yields default if supplied, raises
    ExtractorError if fatal is set, and returns None otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first xpath match that carries it.

    When no such element exists: default is returned if supplied,
    ExtractorError is raised if fatal is set, None is returned otherwise.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
306
307
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    element_id = id
    return get_element_by_attribute('id', element_id, html)
43e8fafd 311
12ea2f30 312
84c237fb
YCH
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name as a whole word"""
    # The class name may be surrounded by other classes inside the quotes
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute(
        'class', class_pattern, html, escape_value=False)
317
318
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    if escape_value:
        value = re.escape(value)

    # Opening tag with the wanted attribute among arbitrary others, then the
    # shortest content up to the matching closing tag
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    m = re.search(pattern, html)
    if m is None:
        return None

    res = m.group('content')
    if res.startswith(('"', "'")):
        res = res[1:-1]

    return unescapeHTML(res)
a921f407 342
c5229f39 343
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the attributes of the start tag, replacing any earlier ones"""
        self.attrs = dict(attrs)
352
c5229f39 353
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
9e6dd238 374
c5229f39 375
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Literal newlines become spaces; <br> and paragraph breaks become newlines
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip the remaining html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
391
392
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Use the underlying binary stream where stdout has one
            stdout_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stdout_stream, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
423
424
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 432
5f6a1245 433
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 472
5f6a1245 473
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform the path is returned unchanged. On Windows the
    path is normalized and, in each component except '.' and '..', the
    characters / < > : " | \\ ? * as well as a trailing whitespace character
    or dot are replaced with '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Raw string: the pattern contains \s, which is an invalid escape
    # sequence in a normal string literal
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
490
491
67dda517
S
492# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
493# unwanted failures due to missing protocol
17bcc626
S
def sanitize_url(url):
    """Return url with 'http:' prepended when it is protocol-less (starts with '//')"""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
496
497
def sanitized_Request(url, *args, **kwargs):
    """Build a Request for url after passing it through sanitize_url()"""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
67dda517
S
500
501
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order of appearance. Elements are compared by equality, so unhashable
    elements are supported.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 509
912b38b4 510
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    named_codepoints = compat_html_entities.name2codepoint
    if entity in named_codepoints:
        return compat_chr(named_codepoints[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric entity, decimal (#123) or hexadecimal (#x7b)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        base = 10
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
540
541
def unescapeHTML(s):
    """Replace the HTML entities in s with their characters; None is passed through"""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+;)', _decode_entity, s)
d77c3dfd 549
8bf48f23 550
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and filenames.

    On Windows (major version >= 5) this is preferredencoding(), elsewhere
    sys.getfilesystemencoding(); 'utf-8' when that reports None.
    """
    # For subprocess calls on Windows, encode with locale encoding
    # Refer to http://stackoverflow.com/a/9951851/35070
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return preferredencoding()

    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
561
562
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    keep_unicode = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0) or
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5) or
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        sys.platform.startswith('java'))
    if keep_unicode:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
585
586
def decodeFilename(b, for_subprocess=False):
    """Decode a bytes filename with the subprocess encoding.

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 596
f07b74fc
PH
597
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(s, True)"""
    if isinstance(s, compat_str):
        argument = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        argument = s.decode('ascii')
    return encodeFilename(argument, True)
605
606
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument via decodeFilename(b, True)"""
    decoded = decodeFilename(b, True)
    return decoded
609
610
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding bytes with preferredencoding().

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 619
5f6a1245 620
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The hour field is used from exactly one hour on and the minute field
    from exactly one minute on, so 3600 gives '1:00:00' and 60 gives '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
628
a0ddb8a2 629
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured from params.

    Certificate verification is disabled when params['nocheckcertificate']
    is true.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 653
732ea2f0 654
08f2a92c
JMF
def bug_reports_message():
    """Return the text asking the user to report a bug, including an update hint"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
664
665
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # An error raised while one of these exceptions is being handled is
        # treated as expected
        currently_handled = sys.exc_info()[0]
        if currently_handled in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 693
1c256f70 694
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
700
701
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
705
706
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that handlers can inspect the original exception
        self.exc_info = exc_info
d77c3dfd
FV
719
720
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
728
729
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialize the base class as well so that str(exc) and exc.args
        # carry the message instead of being empty
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 739
5f6a1245 740
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
744
745
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
753
754
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a message so that str(exc) is not empty
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 767
5f6a1245 768
c5a59d93 769def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
770 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
771 # expected HTTP responses to meet HTTP/1.0 or later (see also
772 # https://github.com/rg3/youtube-dl/issues/6727)
773 if sys.version_info < (3, 0):
5a1a2e94 774 kwargs[b'strict'] = True
be4a824d
PH
775 hc = http_class(*args, **kwargs)
776 source_address = ydl_handler._params.get('source_address')
777 if source_address is not None:
778 sa = (source_address, 0)
779 if hasattr(hc, 'source_address'): # Python 2.7+
780 hc.source_address = sa
781 else: # Python 2.6
782 def _hc_connect(self, *args, **kwargs):
783 sock = compat_socket_create_connection(
784 (self.host, self.port), self.timeout, sa)
785 if is_https:
d7932313
PH
786 self.sock = ssl.wrap_socket(
787 sock, self.key_file, self.cert_file,
788 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
789 else:
790 self.sock = sock
791 hc.connect = functools.partial(_hc_connect, hc)
792
793 return hc
794
795
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is. With it, a new dict is
    returned that lacks both the marker and any Accept-Encoding header
    (matched case-insensitively); the passed mapping is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    kept_headers = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del kept_headers['Youtubedl-no-compression']
    return kept_headers
87f0e62d
YCH
804
805
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set"""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header, must not be sent
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to zlib format"""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whatever the constructor supports"""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Escape the request URL and add the standard headers"""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for header_name, header_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header_name.capitalize() not in req.headers:
                req.add_header(header_name, header_value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decompress gzip/deflate bodies and percent-encode redirect locations"""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request and response processing
    https_request = http_request
    https_response = http_response
bf50b038 935
5de90176 936
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Build a connection class that tunnels base_class through a SOCKS proxy.

    base_class must be an HTTPConnection or HTTPSConnection subclass.
    socks_proxy is a proxy URL whose scheme selects the protocol
    (socks5, socks/socks4, socks4a); the port defaults to 1080 and the
    optional username/password are URL-unquoted.

    Raises ValueError for an unsupported proxy scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously fell through and crashed later with UnboundLocalError
        raise ValueError(
            'Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only real numeric timeouts are applied; other values are left alone
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
978
979
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that can use a custom connection class and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        """Open req, forwarding SSL settings and honouring Ytdl-socks-proxy."""
        connection_cls = self._https_conn_class

        # Forward whichever SSL attributes this Python version's handler has
        open_kwargs = {}
        for attr_name, kwarg_name in (
                ('_context', 'context'),  # python > 2.6
                ('_check_hostname', 'check_hostname')):  # python 3.x
            if hasattr(self, attr_name):
                open_kwargs[kwarg_name] = getattr(self, attr_name)

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            # Internal header; must not be sent to the server
            del req.headers['Ytdl-socks-proxy']

        opener = functools.partial(
            _create_http_connection, self, connection_cls, True)
        return self.do_open(opener, req, **open_kwargs)
be4a824d
PH
1003
1004
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles https requests and responses.

    Currently a thin pass-through to HTTPCookieProcessor; the Set-Cookie
    escaping workaround below is disabled (kept commented out).
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the http hooks for https traffic
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1027
1028
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC offset off date_str.

    Returns (offset, remainder): offset is a datetime.timedelta (zero when
    there is no offset or the suffix is 'Z') and remainder is date_str
    without the recognised suffix.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain 'Z' suffix: UTC
        return datetime.timedelta(), remainder

    direction = -1 if sign_char != '+' else 1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
1045
1046
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    Fractional seconds are dropped. When timezone (a timedelta) is not
    given, it is taken from the string's own UTC offset. Returns None for
    None input or when the string does not match the expected layout.
    """
    if date_str is None:
        return None

    # Strip fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_moment = datetime.datetime.strptime(date_str, layout) - timezone
        return calendar.timegm(utc_moment.timetuple())
    except ValueError:
        return None
912b38b4
PH
1064
1065
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1068
1069
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas, then drop AM/PM markers (+ timezone name) and UTC offset
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to RFC 2822 style parsing
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1096
5f6a1245 1097
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a loosely formatted date string.

    The string is matched against the formats from date_formats(day_first);
    if none matches, RFC 2822 style parsing is attempted. A UTC offset in the
    string and a PM marker are both taken into account.
    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # parsedate_tz returns a plain tuple (or None), not a datetime, so it has
    # no .timetuple(); build a datetime from its first six fields instead.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        try:
            dt = datetime.datetime(*timetuple[:6]) - timezone + pm_delta
        except ValueError:
            # Out-of-range fields (e.g. day 32)
            return None
        return calendar.timegm(dt.timetuple())
    return None
1119
1120
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    Takes the text after the last '.' of the part before any '?'. Returns
    default_ext when url is None or no plausible extension is found.
    """
    if url is None:
        return default_ext

    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1132
5f6a1245 1133
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace filename's extension with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 1136
5f6a1245 1137
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are also accepted on their own.
    Months and years are approximated as 30 and 365 days.
    Raises ValueError if the string matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a normal literal is an invalid escape sequence
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1165
1166
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1175
5f6a1245 1176
bd558525
JMF
1177class DateRange(object):
1178 """Represents a time interval between two dates"""
5f6a1245 1179
bd558525
JMF
1180 def __init__(self, start=None, end=None):
1181 """start and end must be strings in the format accepted by date"""
1182 if start is not None:
1183 self.start = date_from_str(start)
1184 else:
1185 self.start = datetime.datetime.min.date()
1186 if end is not None:
1187 self.end = date_from_str(end)
1188 else:
1189 self.end = datetime.datetime.max.date()
37254abc 1190 if self.start > self.end:
bd558525 1191 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1192
bd558525
JMF
1193 @classmethod
1194 def day(cls, day):
1195 """Returns a range that only contains the given day"""
5f6a1245
JW
1196 return cls(day, day)
1197
bd558525
JMF
1198 def __contains__(self, date):
1199 """Check if the date is in the range"""
37254abc
JMF
1200 if not isinstance(date, datetime.date):
1201 date = date_from_str(date)
1202 return self.start <= date <= self.end
5f6a1245 1203
bd558525 1204 def __str__(self):
5f6a1245 1205 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1206
1207
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1216
1217
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the ids passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character: write it as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1291
1292
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    Binary streams (and any stream on Python 2) receive encoded bytes;
    characters the target encoding cannot represent are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, try the native console API first
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    is_binary = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if is_binary or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1313
1314
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are one-character strings (e.g. Python 2 str)
        return [ord(c) for c in bs]
    # Python 3 bytes already yield ints
    return list(bs)
1322
c257baff 1323
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
c38b1e77
PH
1328
1329
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock() lock on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1403
1404
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction; the lock is taken on entry
    (exclusive unless the mode is 'r') and released, with the file closed,
    on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leak the handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1434
1435
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' if unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1439
1440
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # compat_shlex_quote is already imported by this file; the pipes
        # module is deprecated and absent from recent Python versions
        quoted_args.append(compat_shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1450
1451
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment. Data
    already smuggled in url is kept and takes precedence over data.
    The caller's dict is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Work on a copy so the caller's dict is not mutated
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
1460
1461
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
02dbf93f
PH
1469
1470
02dbf93f
PH
1471def format_bytes(bytes):
1472 if bytes is None:
28e614de 1473 return 'N/A'
02dbf93f
PH
1474 if type(bytes) is str:
1475 bytes = float(bytes)
1476 if bytes == 0.0:
1477 exponent = 0
1478 else:
1479 exponent = int(math.log(bytes, 1024.0))
28e614de 1480 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1481 converted = float(bytes) / float(1024 ** exponent)
28e614de 1482 return '%.2f%s' % (converted, suffix)
f53c966a 1483
1c088fa8 1484
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s using unit_table.

    unit_table maps unit strings to multipliers. A comma in the number is
    treated as a decimal point. Returns the scaled value as an int, or None
    when s does not start with a number followed by a known unit.
    """
    alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    value = float(found.group('num').replace(',', '.'))
    return int(value * unit_table[found.group('unit')])
1494
1495
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1.2GB' into a number of bytes.

    Returns None for None input or an unrecognised string.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    # For each prefix: 'XiB' and 'xB' are powers of 1024, 'XB' and 'Xb'
    # are powers of 1000
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1540
1541
def parse_count(s):
    """Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None input or an unrecognised string.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators: no unit suffix to resolve
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
be64b5b0 1561
2f7ae819 1562
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1570
1571
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    # The abbreviation is the first three letters of the month name
    short_names = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in short_names:
        return None
    return short_names.index(abbrev) + 1
18258362
JMF
1580
1581
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1588
1589
def setproctitle(title):
    """Best-effort: set the process name via libc prctl().

    Silently does nothing on Jython, when libc.so.6 cannot be loaded, or
    when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising from the bytes allocates len + 1, so the buffer is
    # NUL-terminated; a buffer of exactly len(title_bytes) is not.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is the prctl option used here to set the name
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1609
1610
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if absent or None."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1613
1614
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if absent or None.

    An empty end leaves s untouched.
    """
    # Guard against empty end: s[:-0] is '' and would wipe the string
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
1617
1618
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s."""
    if s is None or len(s) < 2:
        return s
    opening, closing = s[0], s[-1]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1]
    return s
1626
1627
def url_basename(url):
    """Return the last path segment of url, ignoring surrounding slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1631
1632
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1636
1637
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1641
1642
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    If get_attr is given, the value is first read from that attribute of v.
    The result is int(v) * invscale // scale. None, '' and values int()
    rejects (wrong text or wrong type) all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: e.g. a list or dict where a number was expected
        return default
9732d77e 1655
9572013d 1656
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1659
9732d77e
PH
1660
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1667
1668
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    rejects (wrong text or wrong type) yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: e.g. a list or dict where a number was expected
        return default
43f775e4
PH
1676
1677
b72b4431
S
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1680
1681
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three forms are tried in order: colon-separated ([[[DD:]HH:]MM:]SS[.ms]),
    unit-suffixed (e.g. '1h 2m 3s', optionally prefixed by 'T'/'PT'), and a
    single fractional 'N hours' / 'N mins' value.
    Returns None if s is not a string or matches none of the forms.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Form 1: colon-separated fields
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Form 2: values followed by day/hour/minute/second units
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Form 3: a single, possibly fractional, hours or minutes value
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading '.', so float() yields the fraction
        duration += float(ms)
    return duration
91d7d0b3
JMF
1728
1729
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename.

    If expected_real_ext is given and filename's extension differs from it,
    ext is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
d70ad093
PH
1736
1737
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and filename's extension differs from it,
    ext is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{0}.{1}'.format(base, ext)
1743
1744
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1753
1754
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is merged into stdout so the version is found on either
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1768
1769
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Returns the first group matched by version_re (default: the token after
    'version'), or unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1779
1780
class PagedList(object):
    """Base class for paged result lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1785
9c44d242
PH
1786
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily through pagefunc(pagenum).

    With use_cache=True, fetched pages are kept in a dict keyed by page number.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching pages as needed."""
        res = []
        # Begin at the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of start within this page (0 unless start falls in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page (None unless end falls in it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1837
1838
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages (pagecount) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching only the pages needed."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the front of the first page
        skip_elems = start - start_page * self._pagesize
        # Number of items still wanted (None means no limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remainder of the request
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1866
1867
def uppercase_escape(s):
    r"""Decode every literal \UXXXXXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(found):
        return decode(found.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
0fe2ff78
YCH
1874
1875
def lowercase_escape(s):
    r"""Decode every literal \uXXXX escape sequence found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(found):
        return decode(found.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
b53466e1 1882
d05cfe06
S
1883
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        # Python 2's quote() works on byte strings
        s = s.encode('utf-8')
    # Reserved characters listed here are left untouched
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1889
1890
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        # Host names are IDNA-encoded instead of percent-encoded
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1901
62e609ab
PH
1902
def read_batch_urls(batch_fd):
    """Read the URLs listed in the batch file object batch_fd.

    Each line is decoded as UTF-8 if necessary, stripped of a leading byte
    order mark and of surrounding whitespace.  Empty lines and lines
    starting with '#', ';' or ']' are dropped.  batch_fd is closed before
    returning.  Returns the list of remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM reads as '\ufeff' once the line has been decoded as
        # UTF-8; the three-character form only appears when the raw bytes
        # were decoded one byte per character, so both must be handled
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1917
1918
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1921
1922
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string.

    Entries of query replace existing parameters of the same name.  url is
    returned unchanged when query is empty.
    """
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed_url.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
16392824 1931
8e60dc75 1932
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with the given parts overridden.

    headers are merged over a copy of req's headers, data and url fall
    back to those of req when falsy, and query is merged into the URL.
    The request class is chosen from req's HTTP method (HEAD, PUT or the
    generic one); the timeout attribute is carried over when present.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    request_classes = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }
    req_type = request_classes.get(
        req.get_method(), compat_urllib_request.Request)
    new_req = req_type(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1951
1952
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    For a list or tuple of keys, keys that are missing or map to None are
    skipped, as are falsy values unless skip_false_values is False;
    default is returned when no key qualifies.  A single key is looked up
    with a plain d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1961
1962
329ca3be
S
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None when the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter
    are swallowed.  When expected_type is given, a result that is not an
    instance of it also yields None.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
1971
1972
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding/errors if it is not one already"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1975
16392824 1976
a1a530b0
PH
# Age limit returned by parse_age_limit for each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Convert an age limit string into an integer.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+')
    or a key of US_RATINGS.  Returns None for None and for unknown strings.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1991
1992
def strip_jsonp(code):
    """Strip a JSONP wrapper such as callback(...); and return the payload.

    Text that does not look like a JSONP call is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1996
1997
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, /* ... */ comments and trailing commas are
    removed, and unquoted hexadecimal/octal integers are rewritten in
    decimal.  Returns the converted text.
    """
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    INTEGER_TABLE = (
        (r'^0[xX][0-9a-fA-F]+', 16),
        (r'^0+[0-7]+', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for a double-quoted JSON string
            v = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_ESCAPES.get(esc.group(0), esc.group(0)),
                v[1:-1])
        else:
            # Only unquoted tokens may be numbers; a quoted string such as
            # "0x10" or '01234' must stay a string
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(0), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
e05f6939
PH
2035
2036
478c2c61
PH
def qualities(quality_ids):
    """Build a function mapping a quality id to its position in quality_ids (-1 if absent)"""
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
2045
acd69589
PH
2046
# %-style template with the named fields title, id and ext.
# NOTE(review): presumably the default output filename template - confirm
# against the code that reads this constant.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2048
a020a0dc
PH
2049
def limit_length(s, length):
    """Shorten s to at most length characters, ending it with '...' when cut.

    None is passed through unchanged.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
48844745
PH
2058
2059
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of integers.

    Raises ValueError if a component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
2062
2063
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either string cannot be parsed by
    version_tuple, the answer is the negation of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
732ea2f0
PH
2071
2072
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or when the
    # interpreter carries a sys.frozen attribute
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
2078
2079
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
2083
2084
def error_to_compat_str(err):
    """Return the message of err as text"""
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
2092
2093
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt may carry parameters (e.g. 'text/vtt; charset=utf-8'); they are
    ignored and the type is compared case-insensitively.  Known types are
    translated through the tables below; for anything else the subtype
    itself is returned.  Returns None when mt is None.
    """
    if mt is None:
        return None

    # Drop parameters such as '; charset=...' or '; codecs="..."' (which
    # may themselves contain '/') and normalize case before any lookup
    mt = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
    }.get(res, res)
2129
2130
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video and audio codec.

    See http://tools.ietf.org/html/rfc6381

    Returns a dict with the keys 'vcodec' and 'acodec' ('none' marks a
    missing one), or {} when codecs_str is empty or cannot be interpreted.
    The first recognized video codec and the first recognized audio codec
    win; a warning is written to stderr for every unrecognized codec.
    """
    if not codecs_str:
        return {}
    splited_codecs = [
        codec.strip()
        for codec in codecs_str.strip().strip(',').split(',')
        if codec.strip()]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing was recognized: fall back on the position of the codecs.
        # vcodec/acodec are both None here, so the raw entries must be used
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2165
2166
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The filename of an 'attachment' Content-Disposition header is tried
    first; otherwise the Content-Type header is mapped with mimetype2ext.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
2179
2180
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI carrying the bytes data with the given MIME type"""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2183
2184
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
2193
2194
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Order matters: the UTF-32 LE mark starts with the UTF-16 LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    text = payload.decode(encoding, 'replace')

    # The result is a match object (truthy) or None
    return re.match(r'^\s*<', text)
a055469f
PH
2213
2214
def determine_protocol(info_dict):
    """Return the protocol of a format dict.

    An explicit 'protocol' entry wins; otherwise the protocol is derived
    from the prefix of info_dict['url'], then from its extension (m3u8,
    f4m) and finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2235
2236
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column but the last is left-aligned and padded to its widest
    # cell plus one character
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
2243
2244
2245def _match_one(filter_part, dct):
2246 COMPARISON_OPERATORS = {
2247 '<': operator.lt,
2248 '<=': operator.le,
2249 '>': operator.gt,
2250 '>=': operator.ge,
2251 '=': operator.eq,
2252 '!=': operator.ne,
2253 }
2254 operator_rex = re.compile(r'''(?x)\s*
2255 (?P<key>[a-z_]+)
2256 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2257 (?:
2258 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2259 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2260 )
2261 \s*$
2262 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2263 m = operator_rex.search(filter_part)
2264 if m:
2265 op = COMPARISON_OPERATORS[m.group('op')]
2266 if m.group('strval') is not None:
2267 if m.group('op') not in ('=', '!='):
2268 raise ValueError(
2269 'Operator %s does not support string values!' % m.group('op'))
2270 comparison_value = m.group('strval')
2271 else:
2272 try:
2273 comparison_value = int(m.group('intval'))
2274 except ValueError:
2275 comparison_value = parse_filesize(m.group('intval'))
2276 if comparison_value is None:
2277 comparison_value = parse_filesize(m.group('intval') + 'B')
2278 if comparison_value is None:
2279 raise ValueError(
2280 'Invalid integer value %r in filter part %r' % (
2281 m.group('intval'), filter_part))
2282 actual_value = dct.get(m.group('key'))
2283 if actual_value is None:
2284 return m.group('none_inclusive')
2285 return op(actual_value, comparison_value)
2286
2287 UNARY_OPERATORS = {
2288 '': lambda v: v is not None,
2289 '!': lambda v: v is None,
2290 }
2291 operator_rex = re.compile(r'''(?x)\s*
2292 (?P<op>%s)\s*(?P<key>[a-z_]+)
2293 \s*$
2294 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2295 m = operator_rex.search(filter_part)
2296 if m:
2297 op = UNARY_OPERATORS[m.group('op')]
2298 actual_value = dct.get(m.group('key'))
2299 return op(actual_value)
2300
2301 raise ValueError('Invalid filter part %r' % filter_part)
2302
2303
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The '&'-separated parts must all match; evaluation stops at the
    # first part that does not
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2309
2310
def match_filter_func(filter_str):
    """Build a filter callback for filter_str.

    The callback takes an info dict and returns None when it passes
    match_str, or a message explaining that the video is skipped.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2319
2320
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Understands offsets such as '12.5' or '12.5s' and clock values of the
    form H:MM:SS with an optional fraction introduced by '.' or ':'.
    Returns None for empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2332
2333
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'"""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2336
2337
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle text into SRT text.

    Every <p> element with a 'begin' time and either an 'end' time or a
    'dur' becomes one SRT cue; other paragraphs are skipped.  Raises
    ValueError when the document contains no <p> element at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target collecting the text of a paragraph; <br>
        # elements become newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the element again and replay it through the target
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []

    # Try the known namespaces in turn, then un-namespaced paragraphs
    for p_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(p_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2391
2392
66e289ba
S
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] if it is unset (None)"""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2396
2397
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as a command line option.

    Returns [command_option, value], or a single joined
    [command_option + separator + value] when separator is given, where
    value is true_value or false_value.  The parameter must be a bool.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2404
2405
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2409
2410
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored under params[param].

    When the parameter is unset (None), default is returned; a list
    default is copied first so that callers extending the result cannot
    modify the shared default list.  A parameter that is set must be a
    list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # Hand out a copy: the default list object is shared between calls
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2417
2418
39672624
YCH
2419class ISO639Utils(object):
2420 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2421 _lang_map = {
2422 'aa': 'aar',
2423 'ab': 'abk',
2424 'ae': 'ave',
2425 'af': 'afr',
2426 'ak': 'aka',
2427 'am': 'amh',
2428 'an': 'arg',
2429 'ar': 'ara',
2430 'as': 'asm',
2431 'av': 'ava',
2432 'ay': 'aym',
2433 'az': 'aze',
2434 'ba': 'bak',
2435 'be': 'bel',
2436 'bg': 'bul',
2437 'bh': 'bih',
2438 'bi': 'bis',
2439 'bm': 'bam',
2440 'bn': 'ben',
2441 'bo': 'bod',
2442 'br': 'bre',
2443 'bs': 'bos',
2444 'ca': 'cat',
2445 'ce': 'che',
2446 'ch': 'cha',
2447 'co': 'cos',
2448 'cr': 'cre',
2449 'cs': 'ces',
2450 'cu': 'chu',
2451 'cv': 'chv',
2452 'cy': 'cym',
2453 'da': 'dan',
2454 'de': 'deu',
2455 'dv': 'div',
2456 'dz': 'dzo',
2457 'ee': 'ewe',
2458 'el': 'ell',
2459 'en': 'eng',
2460 'eo': 'epo',
2461 'es': 'spa',
2462 'et': 'est',
2463 'eu': 'eus',
2464 'fa': 'fas',
2465 'ff': 'ful',
2466 'fi': 'fin',
2467 'fj': 'fij',
2468 'fo': 'fao',
2469 'fr': 'fra',
2470 'fy': 'fry',
2471 'ga': 'gle',
2472 'gd': 'gla',
2473 'gl': 'glg',
2474 'gn': 'grn',
2475 'gu': 'guj',
2476 'gv': 'glv',
2477 'ha': 'hau',
2478 'he': 'heb',
2479 'hi': 'hin',
2480 'ho': 'hmo',
2481 'hr': 'hrv',
2482 'ht': 'hat',
2483 'hu': 'hun',
2484 'hy': 'hye',
2485 'hz': 'her',
2486 'ia': 'ina',
2487 'id': 'ind',
2488 'ie': 'ile',
2489 'ig': 'ibo',
2490 'ii': 'iii',
2491 'ik': 'ipk',
2492 'io': 'ido',
2493 'is': 'isl',
2494 'it': 'ita',
2495 'iu': 'iku',
2496 'ja': 'jpn',
2497 'jv': 'jav',
2498 'ka': 'kat',
2499 'kg': 'kon',
2500 'ki': 'kik',
2501 'kj': 'kua',
2502 'kk': 'kaz',
2503 'kl': 'kal',
2504 'km': 'khm',
2505 'kn': 'kan',
2506 'ko': 'kor',
2507 'kr': 'kau',
2508 'ks': 'kas',
2509 'ku': 'kur',
2510 'kv': 'kom',
2511 'kw': 'cor',
2512 'ky': 'kir',
2513 'la': 'lat',
2514 'lb': 'ltz',
2515 'lg': 'lug',
2516 'li': 'lim',
2517 'ln': 'lin',
2518 'lo': 'lao',
2519 'lt': 'lit',
2520 'lu': 'lub',
2521 'lv': 'lav',
2522 'mg': 'mlg',
2523 'mh': 'mah',
2524 'mi': 'mri',
2525 'mk': 'mkd',
2526 'ml': 'mal',
2527 'mn': 'mon',
2528 'mr': 'mar',
2529 'ms': 'msa',
2530 'mt': 'mlt',
2531 'my': 'mya',
2532 'na': 'nau',
2533 'nb': 'nob',
2534 'nd': 'nde',
2535 'ne': 'nep',
2536 'ng': 'ndo',
2537 'nl': 'nld',
2538 'nn': 'nno',
2539 'no': 'nor',
2540 'nr': 'nbl',
2541 'nv': 'nav',
2542 'ny': 'nya',
2543 'oc': 'oci',
2544 'oj': 'oji',
2545 'om': 'orm',
2546 'or': 'ori',
2547 'os': 'oss',
2548 'pa': 'pan',
2549 'pi': 'pli',
2550 'pl': 'pol',
2551 'ps': 'pus',
2552 'pt': 'por',
2553 'qu': 'que',
2554 'rm': 'roh',
2555 'rn': 'run',
2556 'ro': 'ron',
2557 'ru': 'rus',
2558 'rw': 'kin',
2559 'sa': 'san',
2560 'sc': 'srd',
2561 'sd': 'snd',
2562 'se': 'sme',
2563 'sg': 'sag',
2564 'si': 'sin',
2565 'sk': 'slk',
2566 'sl': 'slv',
2567 'sm': 'smo',
2568 'sn': 'sna',
2569 'so': 'som',
2570 'sq': 'sqi',
2571 'sr': 'srp',
2572 'ss': 'ssw',
2573 'st': 'sot',
2574 'su': 'sun',
2575 'sv': 'swe',
2576 'sw': 'swa',
2577 'ta': 'tam',
2578 'te': 'tel',
2579 'tg': 'tgk',
2580 'th': 'tha',
2581 'ti': 'tir',
2582 'tk': 'tuk',
2583 'tl': 'tgl',
2584 'tn': 'tsn',
2585 'to': 'ton',
2586 'tr': 'tur',
2587 'ts': 'tso',
2588 'tt': 'tat',
2589 'tw': 'twi',
2590 'ty': 'tah',
2591 'ug': 'uig',
2592 'uk': 'ukr',
2593 'ur': 'urd',
2594 'uz': 'uzb',
2595 've': 'ven',
2596 'vi': 'vie',
2597 'vo': 'vol',
2598 'wa': 'wln',
2599 'wo': 'wol',
2600 'xh': 'xho',
2601 'yi': 'yid',
2602 'yo': 'yor',
2603 'za': 'zha',
2604 'zh': 'zho',
2605 'zu': 'zul',
2606 }
2607
2608 @classmethod
2609 def short2long(cls, code):
2610 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2611 return cls._lang_map.get(code[:2])
2612
2613 @classmethod
2614 def long2short(cls, code):
2615 """Convert language code from ISO 639-2/T to ISO 639-1"""
2616 for short_name, long_name in cls._lang_map.items():
2617 if long_name == code:
2618 return short_name
2619
2620
4eb10f66
YCH
2621class ISO3166Utils(object):
2622 # From http://data.okfn.org/data/core/country-list
2623 _country_map = {
2624 'AF': 'Afghanistan',
2625 'AX': 'Åland Islands',
2626 'AL': 'Albania',
2627 'DZ': 'Algeria',
2628 'AS': 'American Samoa',
2629 'AD': 'Andorra',
2630 'AO': 'Angola',
2631 'AI': 'Anguilla',
2632 'AQ': 'Antarctica',
2633 'AG': 'Antigua and Barbuda',
2634 'AR': 'Argentina',
2635 'AM': 'Armenia',
2636 'AW': 'Aruba',
2637 'AU': 'Australia',
2638 'AT': 'Austria',
2639 'AZ': 'Azerbaijan',
2640 'BS': 'Bahamas',
2641 'BH': 'Bahrain',
2642 'BD': 'Bangladesh',
2643 'BB': 'Barbados',
2644 'BY': 'Belarus',
2645 'BE': 'Belgium',
2646 'BZ': 'Belize',
2647 'BJ': 'Benin',
2648 'BM': 'Bermuda',
2649 'BT': 'Bhutan',
2650 'BO': 'Bolivia, Plurinational State of',
2651 'BQ': 'Bonaire, Sint Eustatius and Saba',
2652 'BA': 'Bosnia and Herzegovina',
2653 'BW': 'Botswana',
2654 'BV': 'Bouvet Island',
2655 'BR': 'Brazil',
2656 'IO': 'British Indian Ocean Territory',
2657 'BN': 'Brunei Darussalam',
2658 'BG': 'Bulgaria',
2659 'BF': 'Burkina Faso',
2660 'BI': 'Burundi',
2661 'KH': 'Cambodia',
2662 'CM': 'Cameroon',
2663 'CA': 'Canada',
2664 'CV': 'Cape Verde',
2665 'KY': 'Cayman Islands',
2666 'CF': 'Central African Republic',
2667 'TD': 'Chad',
2668 'CL': 'Chile',
2669 'CN': 'China',
2670 'CX': 'Christmas Island',
2671 'CC': 'Cocos (Keeling) Islands',
2672 'CO': 'Colombia',
2673 'KM': 'Comoros',
2674 'CG': 'Congo',
2675 'CD': 'Congo, the Democratic Republic of the',
2676 'CK': 'Cook Islands',
2677 'CR': 'Costa Rica',
2678 'CI': 'Côte d\'Ivoire',
2679 'HR': 'Croatia',
2680 'CU': 'Cuba',
2681 'CW': 'Curaçao',
2682 'CY': 'Cyprus',
2683 'CZ': 'Czech Republic',
2684 'DK': 'Denmark',
2685 'DJ': 'Djibouti',
2686 'DM': 'Dominica',
2687 'DO': 'Dominican Republic',
2688 'EC': 'Ecuador',
2689 'EG': 'Egypt',
2690 'SV': 'El Salvador',
2691 'GQ': 'Equatorial Guinea',
2692 'ER': 'Eritrea',
2693 'EE': 'Estonia',
2694 'ET': 'Ethiopia',
2695 'FK': 'Falkland Islands (Malvinas)',
2696 'FO': 'Faroe Islands',
2697 'FJ': 'Fiji',
2698 'FI': 'Finland',
2699 'FR': 'France',
2700 'GF': 'French Guiana',
2701 'PF': 'French Polynesia',
2702 'TF': 'French Southern Territories',
2703 'GA': 'Gabon',
2704 'GM': 'Gambia',
2705 'GE': 'Georgia',
2706 'DE': 'Germany',
2707 'GH': 'Ghana',
2708 'GI': 'Gibraltar',
2709 'GR': 'Greece',
2710 'GL': 'Greenland',
2711 'GD': 'Grenada',
2712 'GP': 'Guadeloupe',
2713 'GU': 'Guam',
2714 'GT': 'Guatemala',
2715 'GG': 'Guernsey',
2716 'GN': 'Guinea',
2717 'GW': 'Guinea-Bissau',
2718 'GY': 'Guyana',
2719 'HT': 'Haiti',
2720 'HM': 'Heard Island and McDonald Islands',
2721 'VA': 'Holy See (Vatican City State)',
2722 'HN': 'Honduras',
2723 'HK': 'Hong Kong',
2724 'HU': 'Hungary',
2725 'IS': 'Iceland',
2726 'IN': 'India',
2727 'ID': 'Indonesia',
2728 'IR': 'Iran, Islamic Republic of',
2729 'IQ': 'Iraq',
2730 'IE': 'Ireland',
2731 'IM': 'Isle of Man',
2732 'IL': 'Israel',
2733 'IT': 'Italy',
2734 'JM': 'Jamaica',
2735 'JP': 'Japan',
2736 'JE': 'Jersey',
2737 'JO': 'Jordan',
2738 'KZ': 'Kazakhstan',
2739 'KE': 'Kenya',
2740 'KI': 'Kiribati',
2741 'KP': 'Korea, Democratic People\'s Republic of',
2742 'KR': 'Korea, Republic of',
2743 'KW': 'Kuwait',
2744 'KG': 'Kyrgyzstan',
2745 'LA': 'Lao People\'s Democratic Republic',
2746 'LV': 'Latvia',
2747 'LB': 'Lebanon',
2748 'LS': 'Lesotho',
2749 'LR': 'Liberia',
2750 'LY': 'Libya',
2751 'LI': 'Liechtenstein',
2752 'LT': 'Lithuania',
2753 'LU': 'Luxembourg',
2754 'MO': 'Macao',
2755 'MK': 'Macedonia, the Former Yugoslav Republic of',
2756 'MG': 'Madagascar',
2757 'MW': 'Malawi',
2758 'MY': 'Malaysia',
2759 'MV': 'Maldives',
2760 'ML': 'Mali',
2761 'MT': 'Malta',
2762 'MH': 'Marshall Islands',
2763 'MQ': 'Martinique',
2764 'MR': 'Mauritania',
2765 'MU': 'Mauritius',
2766 'YT': 'Mayotte',
2767 'MX': 'Mexico',
2768 'FM': 'Micronesia, Federated States of',
2769 'MD': 'Moldova, Republic of',
2770 'MC': 'Monaco',
2771 'MN': 'Mongolia',
2772 'ME': 'Montenegro',
2773 'MS': 'Montserrat',
2774 'MA': 'Morocco',
2775 'MZ': 'Mozambique',
2776 'MM': 'Myanmar',
2777 'NA': 'Namibia',
2778 'NR': 'Nauru',
2779 'NP': 'Nepal',
2780 'NL': 'Netherlands',
2781 'NC': 'New Caledonia',
2782 'NZ': 'New Zealand',
2783 'NI': 'Nicaragua',
2784 'NE': 'Niger',
2785 'NG': 'Nigeria',
2786 'NU': 'Niue',
2787 'NF': 'Norfolk Island',
2788 'MP': 'Northern Mariana Islands',
2789 'NO': 'Norway',
2790 'OM': 'Oman',
2791 'PK': 'Pakistan',
2792 'PW': 'Palau',
2793 'PS': 'Palestine, State of',
2794 'PA': 'Panama',
2795 'PG': 'Papua New Guinea',
2796 'PY': 'Paraguay',
2797 'PE': 'Peru',
2798 'PH': 'Philippines',
2799 'PN': 'Pitcairn',
2800 'PL': 'Poland',
2801 'PT': 'Portugal',
2802 'PR': 'Puerto Rico',
2803 'QA': 'Qatar',
2804 'RE': 'Réunion',
2805 'RO': 'Romania',
2806 'RU': 'Russian Federation',
2807 'RW': 'Rwanda',
2808 'BL': 'Saint Barthélemy',
2809 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2810 'KN': 'Saint Kitts and Nevis',
2811 'LC': 'Saint Lucia',
2812 'MF': 'Saint Martin (French part)',
2813 'PM': 'Saint Pierre and Miquelon',
2814 'VC': 'Saint Vincent and the Grenadines',
2815 'WS': 'Samoa',
2816 'SM': 'San Marino',
2817 'ST': 'Sao Tome and Principe',
2818 'SA': 'Saudi Arabia',
2819 'SN': 'Senegal',
2820 'RS': 'Serbia',
2821 'SC': 'Seychelles',
2822 'SL': 'Sierra Leone',
2823 'SG': 'Singapore',
2824 'SX': 'Sint Maarten (Dutch part)',
2825 'SK': 'Slovakia',
2826 'SI': 'Slovenia',
2827 'SB': 'Solomon Islands',
2828 'SO': 'Somalia',
2829 'ZA': 'South Africa',
2830 'GS': 'South Georgia and the South Sandwich Islands',
2831 'SS': 'South Sudan',
2832 'ES': 'Spain',
2833 'LK': 'Sri Lanka',
2834 'SD': 'Sudan',
2835 'SR': 'Suriname',
2836 'SJ': 'Svalbard and Jan Mayen',
2837 'SZ': 'Swaziland',
2838 'SE': 'Sweden',
2839 'CH': 'Switzerland',
2840 'SY': 'Syrian Arab Republic',
2841 'TW': 'Taiwan, Province of China',
2842 'TJ': 'Tajikistan',
2843 'TZ': 'Tanzania, United Republic of',
2844 'TH': 'Thailand',
2845 'TL': 'Timor-Leste',
2846 'TG': 'Togo',
2847 'TK': 'Tokelau',
2848 'TO': 'Tonga',
2849 'TT': 'Trinidad and Tobago',
2850 'TN': 'Tunisia',
2851 'TR': 'Turkey',
2852 'TM': 'Turkmenistan',
2853 'TC': 'Turks and Caicos Islands',
2854 'TV': 'Tuvalu',
2855 'UG': 'Uganda',
2856 'UA': 'Ukraine',
2857 'AE': 'United Arab Emirates',
2858 'GB': 'United Kingdom',
2859 'US': 'United States',
2860 'UM': 'United States Minor Outlying Islands',
2861 'UY': 'Uruguay',
2862 'UZ': 'Uzbekistan',
2863 'VU': 'Vanuatu',
2864 'VE': 'Venezuela, Bolivarian Republic of',
2865 'VN': 'Viet Nam',
2866 'VG': 'Virgin Islands, British',
2867 'VI': 'Virgin Islands, U.S.',
2868 'WF': 'Wallis and Futuna',
2869 'EH': 'Western Sahara',
2870 'YE': 'Yemen',
2871 'ZM': 'Zambia',
2872 'ZW': 'Zimbabwe',
2873 }
2874
2875 @classmethod
2876 def short2full(cls, code):
2877 """Convert an ISO 3166-2 country code to the corresponding full name"""
2878 return cls._country_map.get(code.upper())
2879
2880
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a request select its own proxy.

    A request carrying a 'Ytdl-request-proxy' header is sent through the
    proxy named there; the header is removed before the request goes out.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_opener)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Record the SOCKS proxy in a header instead of handling it here
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2904
2905
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
2921
2922
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    Digits are taken from table, which defaults to the first n characters
    of 0-9, a-z, A-Z.  Raises ValueError when n exceeds the length of the
    table, when n is smaller than 2 or when num is negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
    # With floor division the loop below never reaches zero for a negative
    # number or for a base below 2, so reject both up front
    if n < 2:
        raise ValueError('base must be at least 2, got %d' % n)
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    ret = ''
    while num:
        num, digit = divmod(num, n)
        ret = table[digit] + ret
    return ret
f52354a8
YCH
2939
2940
def decode_packed_codes(code):
    """Expand packed JavaScript of the form }('...',base,count,'a|b|...'.split('|')

    Every word of the packed payload is replaced by the symbol stored at
    the index the word denotes in the given base; empty symbols stand for
    the word itself.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfucasted_code, base, count, symbols = packed.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        word = encode_base_n(index, base)
        symbol_table[word] = symbols[index] or word

    return re.sub(
        r'\b(\w+)\b', lambda word_match: symbol_table[word_match.group(0)],
        obfucasted_code)
e154c651 2959
2960
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list (KEY=value,KEY="quoted value",...) into a dict.

    Surrounding double quotes are removed from quoted values.
    """
    pairs = re.findall(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return dict(
        (key, val[1:-1] if val.startswith('"') else val)
        for key, val in pairs)
1143535d
YCH
2968
2969
def urshift(val, n):
    """Shift val right by n bits, mapping negative values to val + 2**32 first"""
    if val < 0:
        val += 0x100000000
    return val >> n