]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[tudou] Extract player URL from the webpage
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to
# any outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel meaning "no default was supplied", so that None stays
# usable as a real default value (see xpath_text).
NO_DEFAULT = object()

# Month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70
71
def preferredencoding():
    """Get preferred encoding.

    Asks locale.getpreferredencoding() for the system encoding and checks
    that it can actually encode text; if either step fails for any reason,
    'UTF-8' is returned instead.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here
        'TEST'.encode(candidate)
        return candidate
    except Exception:
        return 'UTF-8'
d77c3dfd 85
f4bfd65f 86
def write_json_file(obj, fn):
    """ Dump obj as JSON into the file named fn, atomically if possible

    The JSON is first written to a temporary file next to fn, which is then
    renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects, hence the decoding.
        def path_basename(f):
            return os.path.basename(fn).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args['mode'] = 'w'
        args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind, then propagate the error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element matching xpath[@key] or, when val is given, xpath[@key='val'] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the first element under xpath that has attribute key (equal to val if given) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for candidate in node.findall(xpath):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
162
d7e66d39
JMF
163# On python2.6 the xml.etree.ElementTree.Element methods don't support
164# the namespace parameter
5f6a1245
JW
165
166
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix are
    kept unchanged; an unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
177
d77c3dfd 178
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath below node.

    When the element is missing or has no text: return default if one was
    passed, otherwise raise ExtractorError if fatal is set (mentioning name,
    or the xpath when name is None), otherwise return None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    if element is not None and element.text is not None:
        return element.text

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
193
194
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute('id', id, html)
198
12ea2f30 199
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the unescaped content of the first tag in html whose attribute equals value, or None"""

    # Verbose regex: opening tag name, any attributes before and after the
    # wanted one, then the (lazily matched) content up to the closing tag.
    tag_pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(tag_pattern, html)
    if not found:
        return None

    content = found.group('content')
    # Drop a pair of surrounding quotes, if the content starts with one
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 221
9e6dd238
FV
222
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries become the real line breaks, every other tag is dropped.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities, then trim surrounding whitespace
    return unescapeHTML(text).strip()
9e6dd238
FV
238
239
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    The name '-' stands for standard output (its binary buffer when there is
    one). If opening fails for a reason other than EACCES, the name is run
    through sanitize_path() and, when that changes it, opened again; an
    error from this second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            if hasattr(sys.stdout, 'buffer'):
                return (sys.stdout.buffer, filename)
            return (sys.stdout, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp (None if it cannot be parsed)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 279
5f6a1245 280
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    (the underscore/dash/dot clean-up pass is then skipped).
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: 0:12:34 becomes 0_12_34 rather than "0 -12 -34"
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 317
5f6a1245 318
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged. On Windows the path is
    normalized, the drive or UNC prefix is kept as-is, and in every other
    component (except '.' and '..') each character from /<>:"|\\?* as well
    as a trailing dot is replaced with '#'.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the leading separator
        norm_path.pop(0)
    # Raw string: the previous non-raw literal relied on the invalid
    # escape '\.', which newer Python versions warn about.
    forbidden = r'(?:[/<>:"\|\\?\*]|\.$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
335
336
def orderedSet(iterable):
    """ Return a list of the items of iterable with duplicates removed, keeping first-seen order """
    unique = []
    for item in iterable:
        # Equality-based membership test, so unhashable items work too
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 344
912b38b4 345
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric ones ('#123', '#x7b') are
    converted from decimal or hexadecimal. Anything that cannot be converted,
    including numeric entities outside the valid code point range, is
    returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point out of range (e.g. '&#2013266066;'): keep the
            # entity as literal text instead of aborting the whole unescape
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
364
365
def unescapeHTML(s):
    """Replace every '&...;' entity in s with its character; None is passed through"""
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
d77c3dfd 373
8bf48f23 374
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for arguments of subprocess calls ('utf-8' if none is known)"""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
385
386
def encodeFilename(s, for_subprocess=False):
    """
    Encode the file name s for use with the file system or, if
    for_subprocess is set, as a subprocess argument.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    uses_unicode_api = (
        not for_subprocess and
        sys.platform == 'win32' and
        sys.getwindowsversion()[0] >= 5)
    if uses_unicode_api:
        return s

    # Characters the target encoding cannot represent are dropped
    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode a bytes file name on Python 2; anything else is returned unchanged"""

    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    # Undecodable bytes are dropped
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 416
f07b74fc
PH
417
def encodeArgument(s):
    """Encode the command line argument s for a subprocess call"""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
425
426
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument"""
    return decodeFilename(b, for_subprocess=True)
429
430
8271226a
PH
def decodeOption(optval):
    """Return the option value as text, decoding bytes with the preferred encoding; None is kept"""
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 439
5f6a1245 440
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The thresholds are inclusive, so exactly one hour renders as '1:00:00'
    (not '60:00') and exactly one minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
a0ddb8a2 449
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler configured according to params.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are handed on to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 473
732ea2f0 474
08f2a92c
JMF
def bug_reports_message():
    """Return the standard "please report this issue" suffix for error messages"""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
484
485
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        The message is prefixed with video_id and suffixed with the cause when
        those are given; unexpected errors also get the bug report hint.
        """

        # Errors raised while handling one of these are never treated as bugs
        always_expected = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in always_expected:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none"""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 513
1c256f70 514
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error raised when a regular expression did not match"""
525
526
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the sys.exc_info() tuple of the exception that caused the trouble. """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
539
540
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialize the base class too, so that str(exc) and exc.args
        # carry the message on every supported Python version (on Python 2
        # they stayed empty when only self.msg was set).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 559
5f6a1245 560
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
d77c3dfd
FV
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of leaving the
        # base class uninitialized
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 587
5f6a1245 588
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the handler's 'source_address' option, if set.

    The connection object is created from the positional and keyword
    arguments. Without a 'source_address' in ydl_handler._params it is
    returned untouched.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    sa = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = sa
        return hc

    # Python 2.6: no source_address support, so connect() is replaced with a
    # version that creates the socket bound to sa itself
    def _hc_connect(self, *args, **kwargs):
        sock = compat_socket_create_connection(
            (self.host, self.port), self.timeout, sa)
        if is_https:
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        else:
            self.sock = sock
    hc.connect = functools.partial(_hc_connect, hc)

    return hc
609
610
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # First attempt with negative wbits, then with the zlib defaults
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # Without getcode the status is not passed to the constructor,
        # so it is attached afterwards
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for name, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts decompresses, the first error is re-raised.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same header and decompression treatment
    https_request = http_request
    https_response = http_response
bf50b038 702
5de90176 703
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through _create_http_connection"""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Hand on whichever of these attributes this handler instance has
        optional = (
            ('_context', 'context'),  # python > 2.6
            ('_check_hostname', 'check_hostname'),  # python 3.x
        )
        kwargs = {}
        for attribute, option in optional:
            if hasattr(self, attribute):
                kwargs[option] = getattr(self, attribute)

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
be4a824d
PH
719
720
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS. When timezone is
    None, an optional trailing fraction plus 'Z' or a +HH:MM / +HHMM offset
    is stripped and used as the UTC offset (no suffix means UTC); otherwise
    timezone is a timedelta giving the offset. None is passed through.
    """

    if date_str is None:
        return None

    if timezone is None:
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                factor = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=factor * int(tz_match.group('hours')),
                    minutes=factor * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(utc_time.timetuple())
745
746
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first decides whether ambiguous numeric dates are read as
    day/month or month/day. Returns None for None or for input that
    matches none of the known formats.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so strip it, unless
    # the whole string is a D-M-YYYY date whose tail only looks like one
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions += [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        format_expressions += [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
810
5f6a1245 811
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string; it
    is returned only if it is purely alphanumeric, otherwise (or when url
    is None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 820
5f6a1245 821
def subtitles_filename(filename, sub_lang, sub_format):
    """Build '<filename without its last extension>.<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 824
5f6a1245 825
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string is in none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: the previous non-raw literal relied on the invalid
    # escape '\d', which newer Python versions warn about.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
853
854
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
863
5f6a1245 864
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest / latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
894
895
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode bytes results with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
904
905
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special method is
    the Win32 WriteConsoleW call, used only when out is file descriptor 1
    or 2 and the corresponding standard handle is a console. Raises OSError
    if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 / -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE constants)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build ctypes prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle whose file type
        # (ignoring the REMOTE flag) is not CHAR, or one for which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; such a character is then written on its own with a
    # length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written
            s = s[written.value:]
    return True
979
980
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr when out is None) and flush it.

    On win32, when no encoding is forced and the stream has a fileno,
    _windows_write_string is tried first. Otherwise the text is encoded
    (dropping unencodable characters) for byte-oriented streams, or written
    as-is for plain text streams.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        charset = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(charset, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1001
1002
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its integer byte values."""
    if not bs:
        return []
    # Indexing yields an int on Python 3 and a 1-character string on Python 2
    first = bs[0]
    return list(bs) if isinstance(first, int) else [ord(ch) for ch in bs]
1010
c257baff 1011
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    # One unsigned-byte ('B') field per element; empty input short-circuits
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
c38b1e77
PH
1016
1017
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError if LockFileEx fails."""
        ov = OVERLAPPED()
        ov.Offset = 0
        ov.OffsetHigh = 0
        ov.hEvent = 0
        # Kept on the file object because _unlock_file needs the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(ov)
        os_handle = msvcrt.get_osfhandle(f.fileno())
        lock_flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            os_handle, lock_flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        os_handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            os_handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on the file object f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Drop the flock held on the file object f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1081
1082
class locked_file(object):
    """File wrapper that holds a lock on the file inside a with-block.

    The file is opened in the constructor. Entering the context takes the
    lock (shared for mode 'r', exclusive otherwise); leaving it releases
    the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leave the file open when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1112
1113
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1117
1118
def shell_quote(args):
    """Quote every argument for a shell and join them with single spaces.

    args is an iterable of text or byte strings. Byte strings are decoded
    with the filesystem encoding before quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat) replaces the
        # undocumented pipes.quote; the pipes module is gone in Python 3.13.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1128
1129
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The JSON-encoded data travels url-encoded in the fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1136
1137
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle']
        return url, json.loads(smuggled[0])
    return smug_url, default
02dbf93f
PH
1145
1146
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as text with a binary-prefix suffix, e.g. '1.50KiB'.

    bytes may be a number, a string holding a number, or None.
    Returns 'N/A' for None, otherwise the value scaled by a power of 1024
    with two decimals followed by the matching suffix.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: without this, values >= 1024 ** 9 raise
        # IndexError and tiny positive values pick a suffix through a
        # negative index.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1159
1c088fa8 1160
be64b5b0
PH
1161def parse_filesize(s):
1162 if s is None:
1163 return None
1164
1165 # The lower-case forms are of course incorrect and inofficial,
1166 # but we support those too
1167 _UNIT_TABLE = {
1168 'B': 1,
1169 'b': 1,
1170 'KiB': 1024,
1171 'KB': 1000,
1172 'kB': 1024,
1173 'Kb': 1000,
1174 'MiB': 1024 ** 2,
1175 'MB': 1000 ** 2,
1176 'mB': 1024 ** 2,
1177 'Mb': 1000 ** 2,
1178 'GiB': 1024 ** 3,
1179 'GB': 1000 ** 3,
1180 'gB': 1024 ** 3,
1181 'Gb': 1000 ** 3,
1182 'TiB': 1024 ** 4,
1183 'TB': 1000 ** 4,
1184 'tB': 1024 ** 4,
1185 'Tb': 1000 ** 4,
1186 'PiB': 1024 ** 5,
1187 'PB': 1000 ** 5,
1188 'pB': 1024 ** 5,
1189 'Pb': 1000 ** 5,
1190 'EiB': 1024 ** 6,
1191 'EB': 1000 ** 6,
1192 'eB': 1024 ** 6,
1193 'Eb': 1000 ** 6,
1194 'ZiB': 1024 ** 7,
1195 'ZB': 1000 ** 7,
1196 'zB': 1024 ** 7,
1197 'Zb': 1000 ** 7,
1198 'YiB': 1024 ** 8,
1199 'YB': 1000 ** 8,
1200 'yB': 1024 ** 8,
1201 'Yb': 1000 ** 8,
1202 }
1203
1204 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
4349c07d
PH
1205 m = re.match(
1206 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
be64b5b0
PH
1207 if not m:
1208 return None
1209
4349c07d
PH
1210 num_str = m.group('num').replace(',', '.')
1211 mult = _UNIT_TABLE[m.group('unit')]
1212 return int(float(num_str) * mult)
be64b5b0
PH
1213
1214
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    # Months are numbered from 1; None signals an unknown name
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month == name:
            return number
    return None
1222
1223
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # An abbreviation is the first three characters of the month name
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
18258362
JMF
1232
1233
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1240
1241
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl.

    title must be a compat_str. Returns None in every case; silently does
    nothing when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # name is always NUL-terminated. A buffer of exactly len(title_bytes)
    # has no room for the terminator prctl reads up to.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME on Linux
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1255
1256
def remove_start(s, start):
    """Return s without the leading start; s unchanged if it does not begin with it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1261
1262
2b9faf55
PH
def remove_end(s, end):
    """Return s without the trailing end; s unchanged if it does not end with it.

    An empty end leaves s untouched.
    """
    # The emptiness check matters: every string ends with '', and
    # s[:-0] is s[:0], which would wipe the whole string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1267
1268
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1272
1273
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        # Overrides the method the base class would report
        return "HEAD"
7217e148
PH
1277
1278
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiplied by invscale and floor-divided by scale.

    With get_attr set, the attribute of that name is read from v first
    (None if missing). None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1286
9572013d 1287
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1290
9732d77e
PH
1291
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Commas, dots and plus signs are dropped before conversion
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1298
1299
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale.

    Returns default when v is None.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1302
1303
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Returns None when s is not a string or does not match the supported
    formats. "N mins" / "N hours" style inputs go through float_or_none;
    other inputs give an int, or a float when a fractional part is present.
    """
    if not isinstance(s, compat_basestring):
        return None

    text = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', text)
    if not m:
        return None

    # Inputs made of a single minutes or hours figure
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Whole-number components and the seconds each unit is worth
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            total += int(value) * factor
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1348
1349
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of filename's real extension.

    When expected_real_ext is given and the real extension differs from it,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1356
1357
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap filename's real extension for ext.

    When expected_real_ext is given and the real extension differs from it,
    ext is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1363
1364
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1373
1374
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1388
1389
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re needs one capturing group; the default looks for
    'version <something>'. Returns unrecognized when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1399
1400
class PagedList(object):
    """Base class for paged lists; subclasses supply getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1405
9c44d242
PH
1406
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) only for the pages a slice needs."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list, fetching pages as needed."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            page_begin = pagenum * pagesize
            page_end = page_begin + pagesize
            if start >= page_end:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offsets inside this page, when the slice starts or ends here
            skip = start % pagesize if page_begin <= start < page_end else 0
            if end is not None and page_begin <= end <= page_end:
                stop = ((end - 1) % pagesize) + 1
            else:
                stop = None

            if skip != 0 or stop is not None:
                entries = entries[skip:stop]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + skip < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
        return collected
81c2f20b
PH
1448
1449
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        pagesize = self._pagesize
        first_page, to_skip = divmod(start, pagesize)
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // pagesize + 1
            remaining = end - start

        collected = []
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                # Only the first fetched page has leading entries to drop
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1477
1478
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s with the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1485
1486
def lowercase_escape(s):
    """Replace every literal \\uXXXX escape in s with the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1493
d05cfe06
S
1494
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # Text is handed to quote() as UTF-8 bytes on Python 2
        s = s.encode('utf-8')
    # The second argument lists the characters that stay unescaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1500
1501
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1511
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack accepting a text format spec."""
        raw_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(raw_spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack accepting a text format spec."""
        raw_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(raw_spec, *args)
else:
    # struct takes the spec as-is here, so the plain functions are used
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1528
1529
def read_batch_urls(batch_fd):
    """Read URLs, one per line, from batch_fd and close it.

    Lines are decoded as UTF-8 when they are not text already, a leading
    byte order mark is dropped and surrounding whitespace stripped. Empty
    lines and lines starting with '#', ';' or ']' are skipped.
    Returns the list of remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A BOM shows up as U+FEFF in correctly decoded text and as the three
        # characters '\xef\xbb\xbf' when the file was read with a single-byte
        # encoding; checking only the latter misses the common case.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1544
1545
def urlencode_postdata(*args, **kargs):
    """Url-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1548
1549
0990305d
PH
# etree_iter(node): iterate over an element tree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1554
1555
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s and return its root element; doctypes are ignored."""
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    if sys.version_info >= (3, 0):
        return tree
    # Fix up XML parser in Python 2.x
    for n in etree_iter(tree):
        if n.text is not None and not isinstance(n.text, compat_str):
            n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1571
1572
a1a530b0
PH
# Age limit associated with each US rating label (used by parse_age_limit)
US_RATINGS = dict([
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
])
fac55558
PH
1580
1581
146c80e2
S
def parse_age_limit(s):
    """Turn an age limit string into an int.

    Accepts one or two digits with an optional trailing '+', or a key of
    US_RATINGS. Returns None for None and for unrecognised values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1587
1588
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback( ... );' and return what is inside.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1592
1593
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings,
    true/false/null stay as they are, and a comma directly before a closing
    bracket or brace is dropped.
    """
    # How escapes inside single-quoted strings are rewritten
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requoted[esc.group(0)], token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), converted)
1617
1618
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids, or -1 for an unknown id
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
1627
acd69589
PH
1628
# Default output filename template: '<title>-<id>.<ext>' in %-mapping syntax
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1630
a020a0dc
PH
1631
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None or len(s) <= length:
        return s
    ELLIPSES = '...'
    # The ellipses count towards the allowed length
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1640
1641
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1644
1645
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    An empty or unparsable version is treated as new when assume_new is
    true, and as outdated otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1653
1654
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter, or sys.frozen is set
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1660
1661
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1665
1666
c460bdd5
PH
def mimetype2ext(mt):
    """Derive a file extension from a MIME type.

    Uses the part after the last '/', mapped through a small table of
    special cases; unknown subtypes are returned as they are.
    """
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    return special_cases[subtype] if subtype in special_cases else subtype
1675
1676
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a URL handle's response headers.

    A filename in a Content-Disposition attachment header wins; otherwise
    the extension is derived from Content-Type.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1693
1694
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1703
1704
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Checked in order, so the 4-byte UTF-32 marks are tried before the
    # 2-byte UTF-16 ones
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is read as UTF-8 from the start
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    s = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
a055469f
PH
1723
1724
def determine_protocol(info_dict):
    """Work out the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the start of the URL, then from its extension, and finally from
    the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for known_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(known_prefix):
            return known_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1745
1746
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of every column
    widths = []
    for column in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column but the last is left-aligned and padded to width + 1
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    lines = [row_format % tuple(row) for row in table]
    return '\n'.join(lines)
347de493
PH
1753
1754
def _match_one(filter_part, dct):
    """Evaluate a single filter clause against the dictionary dct.

    A clause is either '<key> <op> <value>' with a comparison operator
    (an optional '?' after the operator decides the result when the key is
    missing or None), or '<key>' / '!<key>' testing whether a value is
    present. Raises ValueError for clauses that cannot be parsed.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    binary_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    binary = binary_rex.search(filter_part)
    if binary:
        op_symbol = binary.group('op')
        compare = COMPARISON_OPERATORS[op_symbol]
        text_operand = binary.group('strval')
        if text_operand is not None:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            expected = text_operand
        else:
            number_text = binary.group('intval')
            try:
                expected = int(number_text)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an explicit trailing 'B'
                expected = parse_filesize(number_text)
                if expected is None:
                    expected = parse_filesize(number_text + 'B')
                if expected is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            number_text, filter_part))
        actual = dct.get(binary.group('key'))
        if actual is None:
            return binary.group('none_inclusive')
        return compare(actual, expected)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    unary = unary_rex.search(filter_part)
    if unary:
        test = UNARY_OPERATORS[unary.group('op')]
        return test(dct.get(unary.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1812
1813
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # Clauses are separated by '&' and must all hold
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1819
1820
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and a
    message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
91410c9b
PH
1829
1830
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds as a float.

    Accepts a number with an optional 's' suffix, or 'H:MM:SS' with optional
    fractional seconds. An empty or missing expression gives 0.0; anything
    else that does not match gives None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)

    return None
1842
1843
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    # %d drops the fractional part of each component
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
1846
1847
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document, given as text, to SRT text.

    Every paragraph element becomes one numbered cue. The cue's end comes
    from its 'end' attribute, or from 'begin' plus 'dur' when 'end' is
    absent or zero. Raises ValueError when no paragraph is found.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        pieces = [str_or_empty(node.text)]

        for child in node:
            tag = child.tag
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                # A line break, followed by the text after the element
                pieces.append('\n' + str_or_empty(child.tail))
            elif tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                pieces.append(str_or_empty(parse_node(child)))
            else:
                # Any other element is kept in serialised form
                pieces.append(str_or_empty(xml.etree.ElementTree.tostring(child)))

        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
1888
1889
39672624
YCH
class ISO639Utils(object):
    """Translate between two-letter and three-letter ISO 639 language codes."""

    # Two-letter (ISO 639-1) code -> three-letter (ISO 639-2/T) code.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters of ``code`` take part in the lookup,
        so anything after them (e.g. the ``-US`` in ``en-US``) is ignored.
        Returns None when the two-letter prefix is not in the table.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1.

        Scans the table for the first entry whose three-letter value equals
        ``code`` and returns its two-letter key, or None if nothing matches.
        """
        return next(
            (short for short, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2090
2091
4eb10f66
YCH
class ISO3166Utils(object):
    """Look up country names by their two-letter ISO 3166 country code."""

    # Upper-case two-letter country code -> English country name.
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter ISO 3166-1 alpha-2 country code to its full name.

        The lookup is case-insensitive: ``code`` is upper-cased before it is
        matched against the table. Returns None for an unknown code.
        """
        key = code.upper()
        try:
            return cls._country_map[key]
        except KeyError:
            return None
2350
2351
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request choose its own proxy.

    A request selects a proxy by carrying a 'Ytdl-request-proxy' header;
    the header is removed from the request before it is processed further.
    """

    def __init__(self, proxies=None):
        def _default_opener(scheme):
            # Parameter names and defaults mirror the lambdas that the base
            # ProxyHandler installs; 'type' and 'meth' are bound as defaults
            # so each opener keeps its own scheme.
            def _open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            return _open

        # Install the defaults first: the base initializer below replaces
        # them for every scheme that has a proxy configured.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme, _default_opener(scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open ``req`` through ``proxy`` unless the request overrides it.

        A 'Ytdl-request-proxy' header on the request takes precedence over
        ``proxy`` and is deleted from the request headers. If the resulting
        proxy is the '__noproxy__' sentinel, None is returned and the base
        class is not consulted; otherwise the call is delegated to
        ``ProxyHandler.proxy_open``.
        """
        per_request = req.headers.get('Ytdl-request-proxy')
        if per_request is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = per_request

        if proxy != '__noproxy__':
            return compat_urllib_request.ProxyHandler.proxy_open(
                self, req, proxy, type)
        return None  # No Proxy