]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Merge branch 'master' of github.com:rg3/youtube-dl
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import traceback
28 import xml.etree.ElementTree
29 import zlib
30
31 try:
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
35
36 try:
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
40
41 try:
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
45
46 try:
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
50
51 try:
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
55
56 try:
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
60
61 try:
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
65
66 try:
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
70
71 try:
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
75
76 try:
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
80
81 try:
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
85
86
87 try:
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90 except ImportError:
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
92
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode *string*. Consecutive %XX escapes are accumulated
        # as bytes and decoded together so multi-byte UTF-8 sequences survive.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str: the 'hex' codec turns two hex digits into one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Malformed escape: keep the '%' literally
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        # Split *qs* into a list of (name, value) pairs; both '&' and ';'
        # are accepted as pair separators, like cpython 3's parser.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        # Like urlparse.parse_qs: values of repeated names accumulate in lists
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
171
# Text type: `unicode` on Python 2, `str` on Python 3
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character string: `unichr` on Python 2, `chr` on Python 3
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Python 2.6's ElementTree raises expat's ExpatError instead of ParseError
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
186
def compat_ord(c):
    """Return the integer value of *c*, which may already be an int
    (iterating bytes on Python 3) or a one-character string (Python 2)."""
    return c if type(c) is int else ord(c)
190
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request);
# the User-Agent mimics a desktop browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
201
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text;
        # some broken locales report encodings Python does not know.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; any codec/locale error still falls back.
        pref = 'UTF-8'

    return pref
215
# Python 2's stdout expects bytes, so encode before printing;
# Python 3's print() handles text natively.
if sys.version_info < (3,0):
    def compat_print(s):
        # 'xmlcharrefreplace' keeps unencodable characters visible as &#...;
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
223
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Serialize *obj* as JSON to the file named *fn* (binary mode)
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        # Serialize *obj* as JSON to the file named *fn* (UTF-8 text)
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
234
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only simple attribute names/values are allowed, so the XPath
        # expression built below cannot be broken by special characters
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6's ElementTree lacks attribute-predicate XPath support,
        # so filter the candidates manually
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
248
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' form using *ns_map*.

    Needed because on python2.6 the xml.etree.ElementTree.Element methods
    don't support the namespaces parameter.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(step)
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
261
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The previous pattern used \d+ for both forms, which rejected the hex
    # digits a-f (e.g. '&#x2F;' was left undecoded).
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # '0x...' form so int() accepts the prefix
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
286
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the full document around in self.html so
    subclasses can slice the original text by parser positions."""
    def __init__(self):
        # Fixed: this method was misspelled `__init` (name-mangled, never
        # called as the constructor), so self.html stayed undefined until
        # loads() ran.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        # Parse the complete document in one go
        self.html = html
        self.feed(html)
        self.close()
297
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute/value pair identifying the target tag
        self.attribute = attribute
        self.value = value
        # Becomes [tag_name, start_pos, end_pos] once the tag is located
        self.result = None
        self.started = False
        # Per-tag-name nesting depth inside the target tag
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target tag is found by
        # dropping the offending line and resuming the scan
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any event after the opening tag fixes the content start position
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Target tag closes when its own nesting depth returns to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every parser event after the start tag can mark the content start
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between start and end position, or None if the
        # tag was never found or never closed
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # NOTE(review): after this truncation the following slice is a
            # no-op for the single-line case — looks redundant but harmless
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
363
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id* in the
    given HTML document (None when absent)."""
    return get_element_by_attribute('id', id, html)
367
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: the parser may already have captured the tag before
        # the error was raised
        pass
    return parser.get_result()
376
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        # Value of the name="..." attribute to look for
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest; the last match wins
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        # The captured content attribute, or None if no tag matched
        return self.result
397
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    parser = MetaParser(name)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: the meta tag may have been seen before the error
        pass
    return parser.get_result()
408
409
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Real line structure comes from <br/> and paragraph boundaries,
    # not from literal newlines in the markup
    flattened = html.replace('\n', ' ')
    with_breaks = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', flattened)
    with_breaks = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', with_breaks)
    # Drop the remaining tags, then decode HTML entities
    text = re.sub('<.*?>', '', with_breaks)
    return unescapeHTML(text).strip()
421
422
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # stdout must be in binary mode on Windows to avoid CRLF mangling
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # Permission problems will not be solved by renaming; give up
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Fixed: the path parts must be passed to os.path.join as separate
        # arguments — passing a single generator never performed the join.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Fixed: previously this reopened the original `filename`
            # (guaranteed to fail again) instead of the sanitized name.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
456
457
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp
    (None when the string cannot be parsed)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
465
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible.
    """
    def _sanitize_char(char):
        # Control characters and '?' are never allowed
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(_sanitize_char(char) for char in s)
    if not is_id:
        # Collapse runs of underscores left over from the substitutions
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
497
def orderedSet(iterable):
    """Return a list of the elements of *iterable* with duplicates removed,
    keeping first-seen order. (List membership is used on purpose so that
    unhashable elements keep working.)"""
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
505
506
def unescapeHTML(s):
    """Replace HTML entities (&amp;, &#47;, ...) in *s* with their
    characters; None passes through unchanged."""
    if s is None:
        return None
    assert type(s) == compat_str

    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
    return result
514
515
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a child process
           (selects the locale encoding on Unicode-capable Windows)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
542
def decodeOption(optval):
    """Decode a command line option value to text; None passes through."""
    if optval is None:
        return None
    # Python 2 command lines deliver option values as locale-encoded bytes
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
551
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    Fixed off-by-one boundaries: exactly 3600 seconds now renders as
    '1:00:00' (previously '60:00') and exactly 60 seconds as '1:00'
    (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
559
560
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler installed into the urllib opener.

    On Python < 3.2 (no usable SSLContext) a custom connection class tries
    SSLv3 first and falls back to automatic protocol negotiation; on newer
    Pythons an SSLContext is configured, honouring --no-check-certificate.
    NOTE(review): PROTOCOL_SSLv3 has been removed from modern ssl/OpenSSL
    builds — this code predates that removal; confirm before reuse.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Proxy tunnelling must happen on the raw socket before
                    # it is wrapped in TLS
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # Server rejected SSLv3: let OpenSSL negotiate
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
594
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected": no bug-report blurb
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        # Render the saved traceback as a string, or None if none was given
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
616
617
# Raised when a mandatory regular expression fails to match the page
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
621
622
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can re-raise or inspect the root cause
        self.exc_info = exc_info
634
635
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
643
644
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Forward the message to Exception so str(exc) is no longer empty;
        # self.msg is kept for existing callers.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
653
# Signals the downloader loop to stop; not a real error condition
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
657
658
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
666
667
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give Exception a useful message — str(exc) used to be empty
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
682
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then zlib-wrapped data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib versions accept `code` directly; older ones need it
        # set after construction
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Replace each header with the standard value (also normalises its
        # capitalisation); per-request overrides use the Youtubedl-* headers
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Strip internal pseudo-headers before the request goes on the wire
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes stripped
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
763
764
def parse_iso8601(date_str):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Recognise a trailing 'Z' or a numeric UTC offset like +01:00 / -0500
    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))
    # Normalise to UTC before converting to an epoch timestamp
    dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - offset
    return calendar.timegm(dt.timetuple())
788
789
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    upload_date = None
    # Commas are noise; a numeric UTC offset is stripped because %z is only
    # supported on python >= 3.2
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Deliberately no break: a later matching expression overrides earlier
    # ones, matching the historical behaviour
    for fmt in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        parsed = email.utils.parsedate_tz(date_str)
        if parsed:
            upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
    return upload_date
830
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from *url*, ignoring any query string;
    fall back to *default_ext* when the candidate is not alphanumeric."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
837
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle filename: media extension replaced by
    '<language>.<subtitle format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
840
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as fixed numbers of days
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
866
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD'; anything not in
    that format is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(parts.groups()) if parts else date_str
875
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable range
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
901
902
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may return locale-encoded bytes here
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
911
912
def write_string(s, out=None, encoding=None):
    """Write the text *s* to the stream *out* (default sys.stderr),
    encoding it to bytes first when the stream requires it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        s = s.encode(encoding or preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32':
            if not encoding and hasattr(out, 'encoding'):
                encoding = out.encoding
            if encoding:
                # Drop unencodable characters and retry as text
                b = s.encode(encoding, 'ignore').decode(encoding)
                out.write(b)
            else:
                raise
        # NOTE(review): on non-win32 platforms the UnicodeEncodeError is
        # silently swallowed here — possibly unintended; confirm.

    out.flush()
936
937
def bytes_to_intlist(bs):
    """Turn a bytes (py3) / str (py2) buffer into a list of byte values;
    falsy input yields an empty list."""
    if not bs:
        return []
    # Python 3 bytes iterate as ints; Python 2 str iterates as characters
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
945
946
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into a bytes
    object (empty bytes for falsy input)."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: assemble the byte string character by character
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
954
955
def get_cachedir(params=None):
    """Return the cache directory: params['cachedir'] when provided,
    otherwise $XDG_CACHE_HOME/youtube-dl (defaulting to ~/.cache/youtube-dl).

    The default was changed from `params={}` to None to avoid the shared
    mutable default argument pitfall; passing a dict works exactly as before.
    """
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return (params or {}).get('cachedir', os.path.join(cache_root, 'youtube-dl'))
960
961
# Cross-platform file locking: _lock_file/_unlock_file use LockFileEx /
# UnlockFileEx via ctypes on Windows and fcntl.lockf everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file, split into low/high DWORDs
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object; the same
        # pointer must be passed to UnlockFileEx later
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
1025
1026
class locked_file(object):
    """Open a file and hold an advisory lock on it while it is used as a
    context manager: shared for 'r', exclusive for 'w' and 'a'."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Readers can share the lock; writers and appenders need it alone.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Failed to lock: do not leak the open handle.
            self.f.close()
            raise
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1056
1057
def shell_quote(args):
    """Return the arguments shell-escaped and joined into a single string."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    def _to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return u' '.join(pipes.quote(_to_text(a)) for a in args)
1069
1070
def takewhile_inclusive(pred, seq):
    """Like itertools.takewhile, but also yield the first element for which
    the predicate is false (the last one evaluated)."""
    for item in seq:
        yield item
        if not pred(item):
            break
1078
1079
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended as a '__youtubedl_smuggle'
    query string inside the URL fragment; unsmuggle_url reverses this.
    """
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1086
1087
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL by smuggle_url.

    Returns a (url, data) pair; data is *default* when nothing was smuggled.
    """
    if u'#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1095
1096
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. u'1.50KiB'.

    Accepts None (formatted as u'N/A'), numbers, and numeric strings.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp the exponent so that values below 1B or above the YiB range
        # do not index outside SUFFIXES (the unclamped log previously raised
        # IndexError for huge values and picked u'YiB' for tiny fractions).
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, SUFFIXES[exponent])
1109
1110
def str_to_int(int_str):
    """Interpret a string as an integer, ignoring ',' and '.' used as
    thousands separators."""
    cleaned = int_str.replace(u',', u'').replace(u'.', u'')
    return int(cleaned)
1114
1115
def get_term_width():
    """Return the width of the terminal in columns, or None if unknown.

    Prefers the COLUMNS environment variable, then falls back to asking
    'stty size'; returns None when neither source is usable.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: 'stty' may be absent or its output unparsable.
        # (Was a bare except, which also swallowed KeyboardInterrupt.)
        pass
    return None
1130
1131
def month_by_name(name):
    """Return the 1-based number of a month given its (locale-independent)
    English name, or None for an unrecognized name."""
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1142
1143
def fix_xml_ampersands(xml_str):
    """Replace every stray '&' in XML by '&amp;', leaving already-escaped
    entities and numeric character references untouched."""
    stray_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return stray_amp.sub(u'&amp;', xml_str)
1150
1151
def setproctitle(title):
    """Set the process name (as shown by e.g. `ps`) via glibc's prctl.

    Silently does nothing when libc.so.6 cannot be loaded (non-glibc
    systems) or when the loaded libc has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # NOTE(review): the buffer has no room for a trailing NUL; presumably
    # fine because PR_SET_NAME truncates to 16 bytes anyway - confirm.
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1165
1166
def remove_start(s, start):
    """Return *s* with the prefix *start* stripped off, if present."""
    return s[len(start):] if s.startswith(start) else s
1171
1172
def url_basename(url):
    """Return the last path component of a URL (query and fragment ignored)."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1176
1177
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues an HTTP HEAD instead of the default GET."""
    def get_method(self):
        return "HEAD"
1181
1182
def int_or_none(v, scale=1, default=None):
    """Convert *v* to an int (floor-divided by *scale*); return *default*
    when v is None."""
    if v is None:
        return default
    return int(v) // scale
1185
1186
def float_or_none(v, scale=1, default=None):
    """Convert *v* to a float (divided by *scale*); return *default* when
    v is None."""
    if v is None:
        return default
    return float(v) / scale
1189
1190
def parse_duration(s):
    """Parse a duration string like u'1:02:03', u'3h11m53s' or u'45' into
    a number of seconds; return None for None or unparsable input."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    duration = int(m.group('secs'))
    mins = m.group('mins')
    if mins:
        duration += int(mins) * 60
    hours = m.group('hours')
    if hours:
        duration += int(hours) * 3600
    return duration
1205
1206
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension:
    (u'video.mp4', u'temp') -> u'video.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
1210
1211
def check_executable(exe, args=[]):
    """Check whether the given binary can be launched (i.e. is in PATH);
    return its name on success and False on failure.

    args can be a list of arguments for a short output (like -version).
    """
    # Note: the mutable default is safe here because args is never mutated.
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1220
1221
class PagedList(object):
    """Lazily-evaluated list of items fetched page by page.

    pagefunc(pagenum) must return an iterable with the contents of the
    0-based page *pagenum*; every page except possibly the last is assumed
    to contain exactly *pagesize* items.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items with absolute indices [start, end) as a list
        (end=None means "until exhausted")."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute index of the first item on this page / the next page.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset within this page of the first wanted item.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item, or None for "whole page".
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1267
1268
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences found in *s* into the
    corresponding characters (lowercase \\u escapes are left alone)."""
    decoder = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decoder(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1275
# Probe whether this interpreter's struct module accepts text format specs.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text spec to ASCII bytes before delegating to struct.pack.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same ASCII-encoding shim for struct.unpack.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text specs work natively; use the stdlib functions directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1292
1293
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Lines are decoded from UTF-8 if needed; a leading BOM is stripped;
    blank lines and lines starting with '#', ';' or ']' (comments/garbage
    markers) are skipped.  The file object is closed afterwards.
    """
    def fixup(url):
        # File iteration yields either text or raw bytes depending on how
        # the file was opened; normalize to text.
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # The BOM may show up either as the raw UTF-8 byte sequence
        # mis-decoded to three characters, or as the real U+FEFF character
        # after a proper UTF-8 decode - strip both forms.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1308
1309
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return ASCII bytes suitable for use as
    an HTTP POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1312
1313
def parse_xml(s):
    """Parse an XML document from a text string, silently ignoring any
    doctype declaration."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            # Swallow doctypes instead of letting the parser choke on them.
            pass

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        # Python 2.6's XML() does not accept a parser argument.
        kwargs = {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1322
1323
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: encode unicode prompts with the locale's
    # preferred encoding before handing them to getpass (presumably
    # because its console writer cannot handle unicode there - confirm).
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stdlib function works as-is.
    compat_getpass = getpass.getpass
1331
1332
# US content ratings mapped to ages (presumably consumed by extractors for
# age-gating - confirm against callers).
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1340
1341
def strip_jsonp(code):
    """Strip a JSONP wrapper like u'callback({...});' and return the bare
    payload; input without such a wrapper is returned unchanged."""
    m = re.match(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', code)
    return m.group(1) if m else code