# youtube_dl/utils.py (from the yt-dlp git mirror; commit: "toypics.net support")
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import contextlib
5 import ctypes
6 import datetime
7 import email.utils
8 import errno
9 import getpass
10 import gzip
11 import itertools
12 import io
13 import json
14 import locale
15 import math
16 import os
17 import pipes
18 import platform
19 import re
20 import ssl
21 import socket
22 import struct
23 import subprocess
24 import sys
25 import traceback
26 import xml.etree.ElementTree
27 import zlib
28
29 try:
30 import urllib.request as compat_urllib_request
31 except ImportError: # Python 2
32 import urllib2 as compat_urllib_request
33
34 try:
35 import urllib.error as compat_urllib_error
36 except ImportError: # Python 2
37 import urllib2 as compat_urllib_error
38
39 try:
40 import urllib.parse as compat_urllib_parse
41 except ImportError: # Python 2
42 import urllib as compat_urllib_parse
43
44 try:
45 from urllib.parse import urlparse as compat_urllib_parse_urlparse
46 except ImportError: # Python 2
47 from urlparse import urlparse as compat_urllib_parse_urlparse
48
49 try:
50 import urllib.parse as compat_urlparse
51 except ImportError: # Python 2
52 import urlparse as compat_urlparse
53
54 try:
55 import http.cookiejar as compat_cookiejar
56 except ImportError: # Python 2
57 import cookielib as compat_cookiejar
58
59 try:
60 import html.entities as compat_html_entities
61 except ImportError: # Python 2
62 import htmlentitydefs as compat_html_entities
63
64 try:
65 import html.parser as compat_html_parser
66 except ImportError: # Python 2
67 import HTMLParser as compat_html_parser
68
69 try:
70 import http.client as compat_http_client
71 except ImportError: # Python 2
72 import httplib as compat_http_client
73
74 try:
75 from urllib.error import HTTPError as compat_HTTPError
76 except ImportError: # Python 2
77 from urllib2 import HTTPError as compat_HTTPError
78
79 try:
80 from urllib.request import urlretrieve as compat_urlretrieve
81 except ImportError: # Python 2
82 from urllib import urlretrieve as compat_urlretrieve
83
84
85 try:
86 from subprocess import DEVNULL
87 compat_subprocess_get_DEVNULL = lambda: DEVNULL
88 except ImportError:
89 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
90
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode `string`; runs of %XX bytes are collected and
        # decoded together with `encoding` so multi-byte UTF-8 works.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex'): '2f' -> '/'
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split a query string on '&'/';' into a list of (name, value)
        # unicode pairs, percent-decoding both sides.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Same interface as urllib.parse.parse_qs: name -> list of values.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
            encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
169
# Unicode text type: `unicode` on Python 2, `str` on Python 3.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> character: `unichr` on Python 2, `chr` on Python 3.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# ElementTree raises ParseError on 2.7+/3.x; 2.6 surfaces expat's ExpatError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
184
def compat_ord(c):
    """Return the integer value of a byte/character.

    Indexing a Python 3 bytes object already yields an int; Python 2 str
    (and 1-char text strings) need ord(). Uses isinstance instead of the
    `type(c) is int` anti-idiom.
    """
    if isinstance(c, int):
        return c
    return ord(c)
188
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers attached to every request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale encoding is missing or unusable.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed here.
        pref = 'UTF-8'

    return pref
213
# print() of a unicode string: Python 2 must encode for the terminal first
# (unencodable characters become XML character references).
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Only unicode strings are accepted.
        assert type(s) == type(u'')
        print(s)

# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Serialize obj as JSON into the file named fn.
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        # Serialize obj as JSON into the file named fn (UTF-8 text).
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
232
# ElementTree supports [@attr='value'] XPath predicates only from 2.7 on;
# emulate them with a linear scan on older interpreters.
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val to harmless characters so the string
        # interpolation below cannot break the XPath expression.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Fallback: scan every match of xpath and compare the attribute.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
246
247 # On python2.6 the xml.etree.ElementTree.Element methods don't support
248 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    ns_map maps namespace prefixes to their URIs; steps without a prefix
    pass through unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
259
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The hex alternative must accept the digits a-f/A-F; the previous
    # pattern `x?\d+` could never match entities such as &#x2F;.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
284
285 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """Common base for the HTML scrapers below: parses a whole document at
    once and keeps the raw HTML around for position-based slicing."""

    def __init__(self):
        # Fixes the previous `__init` misspelling: that name was never
        # invoked as the constructor, so self.html was only ever set
        # inside loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        # Remember the raw document (subclass error recovery needs it),
        # then feed it through the parser in one go.
        self.html = html
        self.feed(html)
        self.close()
295
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Target attribute/value; self.result grows into
        # [tag, start_pos, end_pos] as parsing proceeds.
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Best-effort recovery from malformed HTML: drop the offending line
        # and resume, up to 10 times - unless we are already inside the
        # target tag, in which case the result would be corrupted anyway.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # First event after the opening tag records the start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Track nesting per tag name so the matching close is found.
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                # The element that opened the result just closed.
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the opening tag fixes the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between start and end positions, or None if the
        # target tag was never found / never closed.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # NOTE(review): for single-line results the end column is
            # offset-adjusted here and then truncated again below - looks
            # suspicious but kept as-is; verify against callers.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Python < 2.7.3 chokes on the "</scr'+'ipt>" token inside inline scripts;
# treat exactly that token as an end tag of the right length.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
361
def get_element_by_id(id, html):
    """Shortcut for get_element_by_attribute("id", ...): return the content
    of the tag carrying the given ID, or None."""
    return get_element_by_attribute("id", id, html)
365
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* whose *attribute*
    equals *value*, or None when no such tag is found."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Malformed documents are tolerated; return whatever was captured.
        pass
    return finder.get_result()
374
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta name=... content=...> tags are interesting.
        if tag != 'meta':
            return
        attributes = dict(attrs)
        if attributes.get('name') == self.name:
            self.result = attributes.get('content')

    def get_result(self):
        # Content of the last matching meta tag, or None.
        return self.result
395
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass  # tolerate malformed markup; return whatever was captured
    return meta_parser.get_result()
406
407
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Converts <br> and paragraph boundaries to newlines, strips all other
    tags, and resolves HTML entities. None passes through unchanged
    (convenient for optional description fields).
    """
    if html is None:
        return html
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
419
420
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # NOTE: os.path.join takes *separate* components; the previous code
        # passed a single generator object, which never built a usable path.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # The previous code mistakenly reopened the original (failing)
            # filename instead of the sanitized alternative.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
454
455
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
463
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
495
def orderedSet(iterable):
    """Return a list of the unique elements of *iterable*, keeping
    first-seen order. (List membership is used on purpose so unhashable
    elements work too.)"""
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
503
def unescapeHTML(s):
    """
    @param s a string
    Replace all HTML entities in s; s must be a unicode string.
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
512
513
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess True when the result is passed to a subprocess
           on Windows, in which case the locale encoding is used.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # 'ignore' silently drops characters the encoding cannot represent.
    return s.encode(encoding, 'ignore')
540
541
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
550
def formatSeconds(secs):
    """Render a duration: H:MM:SS above an hour, M:SS above a minute,
    otherwise a bare number of seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
558
559
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    # Build the HTTPS handler used by the opener; opts_no_check_certificate
    # disables certificate verification (Python >= 3.2 branch only).
    # NOTE(review): both branches prefer SSLv3 (PROTOCOL_SSLv3), which is
    # insecure and absent from modern OpenSSL builds - confirm before reuse.
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                # Set up the CONNECT tunnel first when using a proxy.
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    # Try SSLv3 first, fall back to auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
593
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always "expected" - not youtube-dl bugs.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            # Unexpected errors get the bug-report boilerplate appended.
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        # Render the stored traceback as a string, or None when absent.
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
615
616
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match (raised by _search_regex helpers)."""
    pass
620
621
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for verbose traceback reporting by the caller.
        self.exc_info = exc_info
633
634
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
642
643
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Forward to Exception so str(exc) and tracebacks show the message
        # (previously only self.msg was set and str(exc) was empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
652
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
656
657
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
665
666
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: bytes actually received; expected: bytes announced.
        self.downloaded = downloaded
        self.expected = expected
681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate streams (no zlib header) need the -MAX_WBITS variant;
        # fall back to the standard zlib wrapper format.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Python 2.6's addinfourl cannot take a status code argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers over whatever the caller supplied.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal pseudo-headers are stripped before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes trimmed.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # The same processing applies to HTTPS traffic.
    https_request = http_request
    https_response = http_response
762
763
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas get in the way of every format below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    upload_date = None
    # Deliberately no break: the last matching expression wins, as before.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
804
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from the path part of *url*; fall back to
    *default_ext* when the candidate is not purely alphanumeric."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
811
def subtitles_filename(filename, sub_lang, sub_format):
    """Build "<base>.<lang>.<format>" from a media filename."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
814
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
840
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else is returned unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m else date_str
849
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Omitted bounds fall back to the extremes of the date type.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
875
876
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to unicode.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
885
886
def write_string(s, out=None):
    # Robustly write the unicode string s to out (default: stderr),
    # working around Python 2 byte streams and Windows console codecs.
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32' and hasattr(out, 'encoding'):
            # Round-trip through the console encoding, dropping whatever
            # it cannot represent, then retry.
            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
            out.write(s)
        else:
            raise

    out.flush()
907
908
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing a str yields 1-char strings.
    return [ord(ch) for ch in bs]
916
917
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: list of ints -> byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces (byte) strings.
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
925
926
def get_cachedir(params=None):
    """Return the cache directory: the 'cachedir' option from *params*
    when present, otherwise $XDG_CACHE_HOME/youtube-dl (defaulting to
    ~/.cache/youtube-dl).

    The default for params is None instead of a shared mutable dict
    (classic mutable-default-argument pitfall).
    """
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return (params or {}).get('cachedir', os.path.join(cache_root, 'youtube-dl'))
931
932
933 # Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED structure used by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object for the later unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory lock: exclusive for writers, shared for readers.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
996
997
class locked_file(object):
    """File wrapper that holds an OS-level lock for the duration of a
    `with` block: shared for reads, exclusive for writes/appends."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Never leak the descriptor when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1027
1028
def shell_quote(args):
    """Quote a list of arguments so they can be pasted into a shell."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def to_text(a):
        # Filenames may arrive already encoded by encodeFilename.
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return u' '.join(pipes.quote(to_text(a)) for a in args)
1040
1041
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if not pred(item):
            break
1049
1050
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, smuggled)
1057
1058
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data).

    When the URL carries no smuggled payload, data falls back to `default`.
    """
    # Idiomatic membership test ('not in') instead of 'not ... in ...'.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled data is urlencoded, so it cannot itself contain '#';
    # rpartition safely splits off the payload fragment.
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1066
1067
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> u'1.50KiB'.

    Accepts None (returns u'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):  # isinstance instead of `type(...) is str`
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    # Clamp: values in (0, 1) gave a negative exponent (indexing the suffix
    # list from the end), and values >= 1024**9 raised IndexError.
    exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1080
1081
def str_to_int(int_str):
    """Parse an integer from a string that may use ',' or '.' as thousands
    separators, e.g. u'1,000,000' -> 1000000."""
    cleaned = re.sub(r'[,\.]', u'', int_str)
    return int(cleaned)
1085
1086
def get_term_width():
    """Best-effort terminal width in columns; None if it cannot be determined.

    Prefers the COLUMNS environment variable, then falls back to `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # stty missing, not a tty, or unparsable output -> width unknown.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        pass
    return None
1101
1102
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1113
1114
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing named or numeric entity
    # references are skipped via the negative lookahead.
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1121
1122
def setproctitle(title):
    """Set the process name (as shown by ps/top) via prctl(PR_SET_NAME).

    Silently does nothing on platforms without libc.so.6 or without prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return  # non-glibc platform
    # Size the buffer by the *encoded* byte length: sizing by len(title)
    # raised ValueError for non-ASCII titles (UTF-8 is multi-byte).
    # (Also dropped the no-op statement `title = title`.)
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (linux/prctl.h)
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1136
1137
def remove_start(s, start):
    """Return s with the prefix `start` removed, or s unchanged if the
    prefix is absent."""
    if not s.startswith(start):
        return s
    return s[len(start):]
1142
1143
def url_basename(url):
    """Return the last path component of a URL (query and fragment are
    excluded by urlparse)."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1147
1148
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request subclass that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return "HEAD"
1152
1153
def int_or_none(v, scale=1, default=None):
    """Convert v to an int divided by `scale`; return `default` (None by
    default, for backward compatibility) when v is None."""
    if v is None:
        return default
    return int(v) // scale
1156
1157
def parse_duration(s):
    """Parse durations like u'3:11:53', u'9:05', u'3h11m53s' or u'87' into
    a total number of seconds (int); return None for None or unparsable
    input."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not m:
        return None
    total = int(m.group('secs'))
    for unit, factor in ((u'mins', 60), (u'hours', 3600)):
        value = m.group(unit)
        if value:
            total += int(value) * factor
    return total
1172
1173
def prepend_extension(filename, ext):
    """Insert `ext` just before the real extension:
    (u'video.mp4', u'temp') -> u'video.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
1177
1178
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # None sentinel instead of a mutable default list (shared-default pitfall).
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
1187
1188
class PagedList(object):
    # Lazily evaluated list of items that are fetched page by page through
    # a user-supplied page function.
    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable(pagenum) -> iterable with that page's items
        # pagesize: number of items per page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in the half-open range [start, end) as a list,
        fetching only the pages that intersect the range."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # [firstid, nextfirstid) is the absolute index range this page covers.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Index of the first wanted item within this page (0 unless the
            # requested range starts mid-page).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Index one past the last wanted item within this page, or None
            # to take the page through to its end.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1234
1235
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in s into the
    corresponding characters."""
    def _expand(m):
        return compat_chr(int(m.group(1), base=16))
    return re.sub(r'\\U([0-9a-fA-F]{8})', _expand, s)
1240
try:
    # Probe whether this interpreter's struct module accepts a unicode
    # format string (fails with TypeError on Python 2.6 and some 2.7s).
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a unicode format spec to ASCII bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Encode a unicode format spec to ASCII bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles unicode format strings natively; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1257
1258
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line.

    Byte lines are decoded as UTF-8, a leading UTF-8 BOM is stripped,
    whitespace is trimmed, and comment lines (starting with '#', ';' or ']')
    as well as empty lines are dropped.  batch_fd is closed when done.
    """
    def fixup(url):
        # Equivalent to the old `not isinstance(url, compat_str)` check on
        # both Python 2 (str is bytes) and Python 3.
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A decoded UTF-8 BOM appears as U+FEFF; the raw byte sequence
        # u'\xef\xbb\xbf' only survives when the line was never decoded.
        # The original compared only against the raw bytes, so a properly
        # decoded BOM was never stripped.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1273
1274
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ASCII bytes, ready to be
    used as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1277
1278
def parse_xml(s):
    """Parse the XML document in the unicode string s into an Element,
    silently ignoring any DOCTYPE declaration."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # swallow doctypes instead of choking on them

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        # Python 2.6's ElementTree.XML does not accept a parser argument.
        kwargs = {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1287
1288
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: encode unicode prompts with the preferred console
    # encoding before handing them to getpass (which presumably expects a
    # byte string there -- the shim exists only for this platform combo).
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stock getpass works as-is.
    compat_getpass = getpass.getpass
1296
1297
# US (MPAA-style) content rating -> minimum viewer age.
# NOTE(review): presumably consumed as age_limit values by extractors --
# verify against callers; also confirm the 'R' -> 16 and 'NC' -> 18 cutoffs
# are the intended policy.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}