]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[litv] Add new extractor
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import struct
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParser,
39 compat_basestring,
40 compat_chr,
41 compat_etree_fromstring,
42 compat_html_entities,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_socket_create_connection,
47 compat_str,
48 compat_urllib_error,
49 compat_urllib_parse,
50 compat_urllib_parse_urlencode,
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
53 compat_urlparse,
54 compat_xpath,
55 shlex_quote,
56 )
57
58
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request (see YoutubeDLHandler.http_request)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit None default
# (used by the xpath_* helpers below)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Media extensions recognised by determine_ext() when guessing from URLs
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps accented characters to ASCII replacements)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
97
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Make sure the reported codec actually works before trusting it
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
111
112
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # Strategy: dump JSON into a NamedTemporaryFile created alongside fn,
    # then os.rename() it over fn so readers never see a half-written file.

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Same directory as fn so the final rename stays on one filesystem
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file, then re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
165
166
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated into the XPath expression below, so restrict
        # it to a safe character set first
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Python 2.6 ElementTree lacks the attribute-predicate syntax used
        # above, so filter the matches manually
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
184
185
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' form using ns_map."""
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
196
197
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a single expression or an
    iterable of candidate expressions).

    On a miss: return `default` if one was given, raise ExtractorError if
    `fatal`, otherwise return None.
    """
    if isinstance(xpath, (str, compat_str)):
        candidates = [xpath]
    else:
        candidates = xpath

    for xp in candidates:
        n = node.find(compat_xpath(xp))
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            raise ExtractorError(
                'Could not find XML element %s' % (xpath if name is None else name))
        return None
    return n
219
220
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
234
235
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matching xpath,
    with default/fatal semantics matching the other xpath_* helpers."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        if name is None:
            name = '%s[@%s]' % (xpath, key)
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
247
248
def get_element_by_id(id, html):
    """Return the inner content of the tag whose id attribute equals `id`
    in the given HTML document (thin wrapper over get_element_by_attribute)."""
    return get_element_by_attribute('id', id, html)
252
253
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    mobj = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if mobj is None:
        return None
    content = mobj.group('content')

    # Strip a single layer of surrounding quotes, if present
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
275
276
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser that collects the attributes of a single
    start tag fed to it (see extract_attributes)."""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; keep the last occurrence
        self.attrs = dict(attrs)
285
286
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
307
308
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
324
325
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows stdout must first be
            # switched to binary mode
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; propagate it
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
356
357
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
365
366
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _replace_char(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps like 12:34:56 readable by turning ':' into '_'
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_replace_char(char) for char in s)
    if not is_id:
        # Collapse runs of underscores and trim cosmetic leftovers
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
405
406
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # Nothing to fix on POSIX systems
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = []
    for path_part in norm_path:
        if path_part in ('.', '..'):
            sanitized_path.append(path_part)
        else:
            # Replace win32-forbidden chars and trailing dot/space
            sanitized_path.append(
                re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part))
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
423
424
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    if url.startswith('//'):
        return 'http:%s' % url
    return url
429
430
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request with a sanitized (scheme-prefixed) URL."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
433
434
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership is checked against a list (not a set) so that
    # unhashable elements are supported as well
    deduped = []
    for item in iterable:
        if item not in deduped:
            deduped.append(item)
    return deduped
442
443
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    codepoint = compat_html_entities.name2codepoint.get(entity)
    if codepoint is not None:
        return compat_chr(codepoint)

    # Numeric entity: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
466
467
def unescapeHTML(s):
    """Replace every HTML entity in s with its decoded character."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _repl(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _repl, s)
475
476
def get_subprocess_encoding():
    """Return the encoding used when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
487
488
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5 and not for_subprocess:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
511
512
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename back to text on Python 2."""
    # Python 3 filenames are already text
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
522
523
def encodeArgument(s):
    """Encode a subprocess argument for the current platform."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
531
532
def decodeArgument(b):
    """Decode a subprocess argument (subprocess flavour of decodeFilename)."""
    return decodeFilename(b, True)
535
536
def decodeOption(optval):
    """Decode a command-line option value to text if it arrived as bytes."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
545
546
def formatSeconds(secs):
    """Format a duration in seconds as [H:]M:SS-style text.

    Examples: 59 -> '59', 61 -> '1:01', 3661 -> '1:01:01'.

    Uses >= at the unit boundaries so that exactly one minute renders as
    '1:00' (not '60') and exactly one hour as '1:00:00' (not '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
554
555
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python's ssl
    capabilities, honouring the 'nocheckcertificate' option in params."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
579
580
def bug_reports_message():
    """Return the standard footer appended to unexpected error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
590
591
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

        message = msg if video_id is None else video_id + ': ' + msg
        if cause:
            message += ' (caused by %r)' % cause
        if not expected:
            # Unexpected error: ask the user to file a bug report
            message += bug_reports_message()
        super(ExtractorError, self).__init__(message)

    def format_traceback(self):
        """Render the stored traceback as a string, or None if absent."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
619
620
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
626
627
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
631
632
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when downloading fails and they are
    not configured to continue on errors; carries the error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
645
646
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
654
655
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to indicate that the
    postprocessing task failed. The message is exposed via the .msg
    attribute (the Exception base is not given any args).
    """

    def __init__(self, msg):
        self.msg = msg
665
666
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
670
671
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
679
680
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size announced by the server, which usually indicates an
    interrupted connection.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
693
694
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an http_class connection, honouring the handler's
    'source_address' option. Passed via functools.partial as the
    connection factory for urllib's do_open()."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: patch connect() to bind the socket
            # to the requested address manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
720
721
def handle_youtubedl_headers(headers):
    """Process internal marker headers before the real request is made.

    The pseudo-header 'Youtubedl-no-compression' is removed and, when
    present, any Accept-Encoding header is dropped as well so that the
    server sends an uncompressed response.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
730
731
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL options dict; read by _create_http_connection
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # 'source_address' option is honoured
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then fall back to zlib-wrapped data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no code argument; set the attribute manually
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Strip internal marker headers (e.g. Youtubedl-no-compression)
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression, trimming up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
850
851
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler, allowing a custom connection
    class and forwarding the handler's SSL context when available."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Only forward context/check_hostname if this Python's HTTPSHandler
        # actually set them up
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
867
868
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also handles HTTPS requests/responses and
    carries a (currently disabled) workaround for non-ASCII Set-Cookie
    headers on Python 2."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
891
892
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Fractional seconds are not used below, drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # Default to UTC, then look for an explicit Z or +/-HH[:]MM suffix
        timezone = datetime.timedelta()
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date: fall through and return None implicitly
        pass
922
923
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    parsed = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous all-numeric dates depend on the day_first convention
    format_expressions.extend([
        '%d-%m-%Y',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%d/%m/%Y %H:%M:%S',
    ] if day_first else [
        '%m-%d-%Y',
        '%m.%d.%Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%d/%Y %H:%M:%S',
    ])
    # NB: deliberately no break on success — a later matching format
    # overwrites an earlier one
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if parsed is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            parsed = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if parsed is not None:
        return compat_str(parsed)
988
989
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media extension from the last path component of url."""
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1001
1002
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1005
1006
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain YYYYMMDD date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Approximate months and years as 30 and 365 days respectively
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1034
1035
def hyphenate_date(date_str):
    """Convert 'YYYYMMDD' to 'YYYY-MM-DD'; other inputs pass through unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        return date_str
    return '-'.join(m.groups())
1044
1045
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)."""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the extreme representable dates
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1075
1076
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back an encoded byte string; normalize to text
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1085
1086
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps C file descriptors (stdout=1, stderr=2) to the corresponding
    # GetStdHandle() identifiers (STD_OUTPUT_HANDLE / STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is only a real console when it is a character device
        # (remote flag masked off) and GetConsoleMode succeeds on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none; such characters occupy two UTF-16
        # code units and must be written on their own.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP characters per call; count == 0 means the
        # next character is non-BMP and is written as a surrogate pair.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1160
1161
def write_string(s, out=None, encoding=None):
    """Write the text s to the stream out (default: sys.stderr), handling
    Windows consoles and byte-oriented streams gracefully."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, prefer the console API so non-ANSI characters display
    # correctly; _windows_write_string returns True when it handled the write.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # unrepresentable characters are dropped instead of raising.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1182
1183
def bytes_to_intlist(bs):
    """Turn a bytes (or Python 2 str) object into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing yields 1-char strings
    return [ord(ch) for ch in bs]
1191
1192
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into bytes."""
    if not xs:
        return b''
    count = len(xs)
    return struct_pack('%dB' % count, *xs)
1197
1198
# Cross-platform file locking
# On Windows this goes through LockFileEx/UnlockFileEx via ctypes; on
# POSIX it uses fcntl.flock(); platforms with neither (e.g. Jython) get
# stubs that raise IOError.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED structure required by LockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the maximum possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1272
1273
class locked_file(object):
    """File wrapper that holds an OS-level lock for the file's lifetime.

    Usable as a context manager; read mode takes a shared lock, write and
    append modes take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers can share the lock; any kind of writer must be exclusive
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1303
1304
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1308
1309
def shell_quote(args):
    """Quote a list of arguments so they can be pasted into a POSIX shell."""
    fs_encoding = get_filesystem_encoding()

    def to_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(to_text(arg)) for arg in args)
1319
1320
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment so servers never see it
    smuggled = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1327
1328
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data), or (url, default) when
    no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.rpartition('#')
    data = json.loads(compat_parse_qs(payload)['__youtubedl_smuggle'][0])
    return url, data
1336
1337
def format_bytes(bytes):
    """Render a byte count like '1.50KiB'; 'N/A' for None."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1350
1351
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using unit_table multipliers; None if no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept a comma as the decimal separator as well
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1361
1362
def parse_filesize(s):
    """Parse a human-readable file size like '1.2MiB' into bytes (or None)."""
    if s is None:
        return None

    # Build the unit table programmatically.  For each SI prefix:
    #   'KiB' -> 1024**n,  'KB' -> 1000**n (official),
    #   'kB' -> 1024**n,   'Kb' -> 1000**n
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1407
1408
def parse_count(s):
    """Parse a view/like count such as '1.2M' into an int (or None)."""
    if s is None:
        return None

    s = s.strip()

    # Plain digit groups (possibly with thousands separators)
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    suffix_table = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(suffix_table, s)
1428
1429
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1437
1438
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1447
1448
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' (one not starting an entity) with '&amp;'."""
    bare_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_amp.sub('&amp;', xml_str)
1455
1456
def setproctitle(title):
    """Best-effort attempt to rename the current process via libc prctl()."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 is PR_SET_NAME — presumably; verify against <sys/prctl.h>
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1476
1477
def remove_start(s, start):
    """Return s without the prefix start (if present)."""
    return s[len(start):] if s.startswith(start) else s
1482
1483
def remove_end(s, end):
    """Return s without the suffix end (if present).

    Guard against an empty suffix: ''.endswith('') is True and s[:-0]
    would evaluate to s[:0] == '', wrongly emptying the string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1488
1489
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1497
1498
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    return compat_urlparse.urlparse(url).path.strip('/').split('/')[-1]
1502
1503
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
1507
1508
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale/scale, or default on failure.

    When get_attr is given, v is first replaced by getattr(v, get_attr).
    Empty strings and None yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (lists, dicts, ...) that
        # int() rejects; previously these crashed instead of returning
        # the default.
        return default
1521
1522
def str_or_none(v, default=None):
    """Stringify v with compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1525
1526
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' signs before converting
    return int(re.sub(r'[,\.\+]', '', int_str))
1533
1534
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale, or default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects that float() rejects;
        # previously these crashed instead of returning the default.
        return default
1542
1543
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3h11m5s', 'PT1H2M3S', '2.5 hours',
    ...) into a number of seconds, or None when unparseable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    # Colon-separated clock format: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if not m:
        # Unit-suffixed format, optionally ISO-8601-style with a PT prefix
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if not m:
        # Fractional '<n> hours' / '<n> minutes' phrasing
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
    if not m:
        return None

    fields = m.groupdict()
    duration = 0
    if fields.get('secs'):
        duration += float(fields['secs'])
    if fields.get('mins'):
        duration += float(fields['mins']) * 60
    if fields.get('hours'):
        duration += float(fields['hours']) * 60 * 60
    if fields.get('days'):
        duration += float(fields['days']) * 24 * 60 * 60
    if fields.get('ms'):
        duration += float(fields['ms'])
    return duration
1590
1591
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension ('a.mp4' -> 'a.ext.mp4'); when the
    real extension is not the expected one, append ext instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1598
1599
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for ext; when the current extension is not the
    expected one, append ext to the full filename instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1605
1606
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Popen raises OSError when the binary cannot be found/executed
        return False
    return exe
1615
1616
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1630
1631
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Pull a version string out of program output using version_re."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1641
1642
class PagedList(object):
    """Base class for lazily-paged result lists; subclasses provide getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1647
1648
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally caching fetched pages."""
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect results[start:end] across page boundaries
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the requested slice within this page (first page only)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # End offset within this page when the slice ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1699
1700
class InAdvancePagedList(PagedList):
    """PagedList for sources where the total page count is known up front."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first page / total items still wanted
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remaining demand; trim and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1728
1729
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1736
1737
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1744
1745
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 quote() needs bytes; encode text to UTF-8 first
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe set includes '%' plus all reserved punctuation, so strings
    # that are already percent-encoded pass through unchanged.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1751
1752
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host goes through IDNA; every other component is percent-escaped
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
1763
# Feature-probe for https://bugs.python.org/issue19099: old interpreters
# reject text format specs, so wrap struct.pack/unpack accordingly.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept text specs directly; no wrapper needed
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1781
1782
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, dropping comments, a BOM, and
    blank lines; the file object is closed afterwards."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' all introduce comment lines in batch files
        return False if url.startswith(('#', ';', ']')) else url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1797
1798
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as the ASCII bytes urllib expects."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
1801
1802
def update_url_query(url, query):
    """Merge the dict query into url's existing query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed.query)
    merged.update(query)
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(merged, True)))
1811
1812
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of req with the given url/data/headers/query applied.

    headers/query now default to None instead of the mutable {} defaults,
    which risked cross-call state leaks if ever mutated; passing {} still
    behaves identically.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    # Preserve the HEAD-ness of the original request
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not a standard Request attribute; carry it over when present
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1825
1826
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up key_or_keys (a single key or a list/tuple of keys) in d and
    return the first usable value; falsy values are skipped unless
    skip_false_values is False."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1835
1836
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Ensure string is a compat_str, decoding byte strings with encoding."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1839
1840
# MPAA-style US content ratings mapped to the minimum viewer age they imply
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1848
1849
def parse_age_limit(s):
    """Parse '18', '18+', or a US rating string into an age-limit int."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s)
1855
1856
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the raw JSON payload."""
    wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return wrapper.sub(r'\1', code)
1860
1861
def js_to_json(code):
    """Rewrite JavaScript-ish object literal syntax into valid JSON."""
    def fix_kv(m):
        v = m.group(0)
        # Bare keywords are already valid JSON
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Double-quoted string: drop JS-only \' escapes
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Single-quoted string: translate escapes for double quoting
            def unescape(esc):
                seq = esc.group(0)
                if seq == '\\\\':
                    return '\\\\'
                if seq == "\\'":
                    return "'"
                return '\\"'  # seq == '"'
            v = re.sub(r"\\\\|\\'|\"", unescape, v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces
    res = re.sub(r',(\s*[\]}])', r'\1', res)
    return res
1885
1886
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1895
1896
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1898
1899
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Truncate so the result, ellipses included, fits in length characters
    return s[:length - len(ELLIPSES)] + ELLIPSES
1908
1909
def version_tuple(v):
    """Split a dotted/hyphenated version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1912
1913
def is_outdated_version(version, limit, assume_new=True):
    """True when version is older than limit; unparseable or missing
    versions are treated according to assume_new."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1921
1922
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Self-updating only makes sense for the zip bundle or a frozen exe
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
1928
1929
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
1933
1934
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte strings properly."""
    msg = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        msg = msg.decode(preferredencoding())
    return msg
1942
1943
def mimetype2ext(mt):
    """Map a MIME type to a file extension; None input yields None."""
    if mt is None:
        return None

    # Full-type special cases take precedence over subtype mapping
    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
    }
    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    subtype = mt.rpartition('/')[2]

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }
    # Unknown subtypes fall through unchanged (e.g. 'mp4' -> 'mp4')
    return SUBTYPE_MAP.get(subtype, subtype)
1967
1968
def urlhandle_detect_ext(url_handle):
    """Work out a file extension from an HTTP response's headers."""
    try:
        headers = url_handle.headers

        def getheader(name):
            return headers[name]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename the server suggests, when it has an extension
    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
1985
1986
def encode_data_uri(data, mime_type):
    """Encode raw bytes as an RFC 2397 base64 'data:' URI."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64)
1989
1990
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # A missing limit on either side means no restriction applies
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1999
2000
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Pick the decoder indicated by a leading BOM, defaulting to UTF-8
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2019
2020
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming schemes are identified directly from the URL prefix
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2041
2042
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of each column is its longest cell
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # Left-align with one space of padding; the last column is unpadded
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2049
2050
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 60' or '!is_live')
    against the dict dct; raises ValueError for unparseable expressions."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # Binary form: <key> <op>[?] <int-or-filesize-or-string>
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only (in)equality makes sense for string comparisons
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size ('500KiB'),
                # then with an implicit 'B' suffix ('500K' -> '500KB')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A trailing '?' after the operator makes missing keys match
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: presence ('key') or absence ('!key') of a value
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2108
2109
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must match
    return all(
        _match_one(part, dct)
        for part in filter_str.split('&'))
2115
2116
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to keep a video, or a
    human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2125
2126
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds; None when unparseable."""
    if not time_expr:
        return None

    # Plain offset in seconds, with optional 's' suffix
    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    # Clock format HH:MM:SS[.f] or HH:MM:SS:frames
    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, minutes, seconds = m.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2138
2139
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each float component toward zero.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2142
2143
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text string) to SRT text.

    Raises ValueError when the document contains no <p> cue elements.
    """
    # Helper that expands 'prefix:tag' for the three TTML namespace
    # variants encountered in the wild.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target that flattens one <p> element to plain text,
        # turning <br> (in any of the known namespaces) into a newline.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            # Strip surrounding whitespace from the accumulated cue text.
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the element and run it through the target parser
        # to extract its text content.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each namespace variant in turn, falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # A cue without a parseable start time cannot be emitted.
            continue
        if not end_time:
            if not dur:
                continue
            # No explicit end: derive it from the 'dur' attribute.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2197
2198
def cli_option(params, command_option, param):
    """Render an optional string parameter as CLI arguments.

    Returns [command_option, value] when params[param] is set,
    otherwise an empty list.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2202
2203
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean parameter as CLI arguments.

    Returns [command_option, value] (or ['<option><separator><value>']
    when a separator is given), where value is true_value/false_value
    depending on params[param]. Returns [] when the parameter is unset.
    """
    param = params.get(param)
    if param is None:
        # Previously an unset option crashed the assert below; an absent
        # boolean simply contributes no arguments.
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2210
2211
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Render a flag-style CLI option.

    Returns [command_option] when params[param] equals expected_value,
    otherwise an empty list.
    """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2215
2216
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI arguments stored under *param*.

    Falls back to *default* when the key is unset; the stored value
    must already be a list.
    """
    args = params.get(param)
    if args is None:
        return default
    assert isinstance(args, list)
    return args
2223
2224
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 code -> ISO 639-2/T code.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are used, so region-qualified
        # codes (e.g. 'en-US') also resolve; returns None if unknown.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup over the table; returns None if unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2425
2426
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    # Maps ISO 3166-1 alpha-2 code -> English country name.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes.
        return cls._country_map.get(code.upper())
2685
2686
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets individual requests override the
    proxy via an internal 'Ytdl-request-proxy' header, which is stripped
    before the request goes out."""

    def __init__(self, proxies=None):
        # Set default handlers
        # Register http_open/https_open so every request is routed through
        # proxy_open even when no global proxy is configured. The lambda
        # default arguments deliberately bind the current loop values of
        # `type` and the bound method (late-binding closure workaround).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Per-request override: consume the internal header if present.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2706
2707
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte string is interpreted little-endian (hence the reversal)
    # as a single big integer before modular exponentiation.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
2723
2724
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer *num* in base *n*.

    Digits are drawn from *table* (a string indexed by digit value);
    when omitted, a default 0-9/a-z/A-Z alphabet truncated to *n*
    characters is used. Raises ValueError if the table is too short.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
2741
2742
def decode_packed_codes(code):
    """Decode Dean Edwards style p.a.c.k.e.r obfuscated JavaScript.

    Extracts the packed payload, base, symbol count and symbol list from
    the standard unpacker call, then substitutes every base-n index word
    in the payload with its original symbol.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Build the reverse mapping: base-n encoded index -> symbol.
    # An empty symbol means the word encodes itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)